# ============================================================================
# AWS Application Load Balancer (ALB) CloudWatch Alarms
# ============================================================================
# Local variables for ALBs and Target Groups.
#
# alb_configs maps one unique key per load balancer to:
#   - alb_name:       name used in alarm names, descriptions and tags
#   - alb_arn_suffix: value used for the CloudWatch "LoadBalancer" dimension
#   - target_groups:  map of target groups, each with the tg_name used in alarm
#                     names and the tg_arn_suffix used for the "TargetGroup"
#                     dimension
locals {
  alb_configs = {
    "AppNameDashboard-PROD-ALB" = {
      alb_name       = "AppNameDashboard-PROD-ALB"
      alb_arn_suffix = "app/AppNameDashboard-PROD-ALB/f93ab370be314618"
      target_groups = {
        "AppNameDashboard-Prod-Primary-TG" = {
          tg_name       = "AppNameDashboard-Prod-Primary-TG"
          tg_arn_suffix = "targetgroup/AppNameDashboard-Prod-Primary-TG/fb9ae754143996ce"
        }
      }
    }

    # NOTE(review): this entry and the following one were both declared under
    # the key "AppName-PROD-ALB". An HCL object expression keeps only the last
    # value for a repeated key, so this load balancer (.../848a3cd3bb37bffd) and
    # its target group were silently left without any alarms. The key and
    # alb_name below carry the first characters of the ARN id so that both load
    # balancers get their own, distinctly named alarms. The following entry
    # keeps its original key, so existing resource addresses do not change.
    # TODO: replace the suffixed name with the real ALB name if it differs.
    "AppName-PROD-ALB-848a3cd3" = {
      alb_name       = "AppName-PROD-ALB-848a3cd3"
      alb_arn_suffix = "app/AppName-PROD-ALB/848a3cd3bb37bffd"
      target_groups = {
        "AppName-PROD-Primary-TG" = {
          tg_name       = "AppName-PROD-Primary-TG"
          tg_arn_suffix = "targetgroup/AppName-PROD-Primary-TG/104dd21318466571"
        }
      }
    }

    "AppName-PROD-ALB" = {
      alb_name       = "AppName-PROD-ALB"
      alb_arn_suffix = "app/AppName-PROD-ALB/264b7bbd12acfba7"
      target_groups = {
        "AppName-PROD-TG-Primary" = {
          tg_name       = "AppName-PROD-TG-Primary"
          tg_arn_suffix = "targetgroup/AppName-PROD-TG-Primary/64453119eb44f29e"
        }
      }
    }

    "AppName-elb" = {
      alb_name       = "AppName-elb"
      alb_arn_suffix = "app/AppName-elb/35d47d181f692f3d"
      target_groups = {
        "AppName-target-grp" = {
          tg_name       = "AppName-target-grp"
          tg_arn_suffix = "targetgroup/AppName-target-grp/83efd5484608930d"
        }
      }
    }
  }

  # Flatten structure for target group alarms: one entry per (ALB, target group)
  # pair, keyed "<alb_key>_<tg_key>", carrying both the ALB and the target group
  # attributes so each alarm can set both CloudWatch dimensions.
  target_groups_flat = merge([
    for alb_key, alb in local.alb_configs : {
      for tg_key, tg in alb.target_groups : "${alb_key}_${tg_key}" => {
        alb_name       = alb.alb_name
        alb_arn_suffix = alb.alb_arn_suffix
        tg_name        = tg.tg_name
        tg_arn_suffix  = tg.tg_arn_suffix
      }
    }
  ]...)

  # Alarm thresholds - Set higher to reduce noise and only alert on real issues
  tg_4xx_threshold                  = 100 # Number of 4xx errors at target group level (Sum over 5 mins)
  tg_5xx_threshold                  = 50  # Number of 5xx errors at target group level (Sum over 5 mins)
  alb_response_time_threshold       = 5   # Response time in seconds at ALB level (Average)
  tg_response_time_threshold        = 5   # Response time in seconds at target group level (Average)
  alb_rejected_connection_threshold = 25  # Number of rejected connections (Sum over 5 mins)
}
# ============================================================================
# ALB-LEVEL ALARMS
# ============================================================================
# CloudWatch Alarm for ALB Target Response Time.
# One alarm per entry in local.alb_configs. Fires when the 5-minute Average of
# TargetResponseTime stays above local.alb_response_time_threshold for 3
# consecutive periods; notifications go to var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "alb_target_response_time" {
  for_each = local.alb_configs

  alarm_name        = "CW-ALB-${each.value.alb_name}-TargetResponseTime"
  alarm_description = "This metric monitors target response time for ${each.value.alb_name}. Alarms after 3 consecutive breaches (15 minutes)"

  namespace   = "AWS/ApplicationELB"
  metric_name = "TargetResponseTime"
  statistic   = "Average"

  # Numeric arguments are written as numbers rather than quoted strings.
  period              = 300 # 5 minutes
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.alb_response_time_threshold

  # Periods with no data points are not counted as breaches.
  treat_missing_data = "notBreaching"
  alarm_actions      = [var.sns_topic_arn]

  dimensions = {
    LoadBalancer = each.value.alb_arn_suffix
  }

  tags = {
    Name        = "CW-ALB-${each.value.alb_name}-TargetResponseTime"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# CloudWatch Alarm for ALB Rejected Connection Count.
# One alarm per entry in local.alb_configs. Fires when the 5-minute Sum of
# RejectedConnectionCount stays above local.alb_rejected_connection_threshold
# for 3 consecutive periods; notifications go to var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "alb_rejected_connections" {
  for_each = local.alb_configs

  alarm_name        = "CW-ALB-${each.value.alb_name}-RejectedConnectionCount"
  alarm_description = "This metric monitors rejected connections for ${each.value.alb_name}. Alarms after 3 consecutive breaches (15 minutes)"

  namespace   = "AWS/ApplicationELB"
  metric_name = "RejectedConnectionCount"
  statistic   = "Sum"

  # Numeric arguments are written as numbers rather than quoted strings.
  period              = 300 # 5 minutes
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.alb_rejected_connection_threshold

  # Periods with no data points are not counted as breaches.
  treat_missing_data = "notBreaching"
  alarm_actions      = [var.sns_topic_arn]

  dimensions = {
    LoadBalancer = each.value.alb_arn_suffix
  }

  tags = {
    Name        = "CW-ALB-${each.value.alb_name}-RejectedConnectionCount"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# ============================================================================
# TARGET GROUP-LEVEL ALARMS
# ============================================================================
# CloudWatch Alarm for Target Group HTTP 4xx Errors.
# One alarm per entry in local.target_groups_flat. Fires when the 5-minute Sum
# of HTTPCode_Target_4XX_Count stays above local.tg_4xx_threshold for 3
# consecutive periods; notifications go to var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "tg_4xx_errors" {
  for_each = local.target_groups_flat

  alarm_name        = "CW-TG-${each.value.tg_name}-HTTPCode4XX"
  alarm_description = "This metric monitors 4xx errors from target group ${each.value.tg_name}. Alarms after 3 consecutive breaches (15 minutes)"

  namespace   = "AWS/ApplicationELB"
  metric_name = "HTTPCode_Target_4XX_Count"
  statistic   = "Sum"

  # Numeric arguments are written as numbers rather than quoted strings.
  period              = 300 # 5 minutes
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.tg_4xx_threshold

  # Periods with no data points are not counted as breaches.
  treat_missing_data = "notBreaching"
  alarm_actions      = [var.sns_topic_arn]

  # The metric is selected by the target group and the load balancer it sits behind.
  dimensions = {
    TargetGroup  = each.value.tg_arn_suffix
    LoadBalancer = each.value.alb_arn_suffix
  }

  tags = {
    Name        = "CW-TG-${each.value.tg_name}-HTTPCode4XX"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# CloudWatch Alarm for Target Group HTTP 5xx Errors.
# One alarm per entry in local.target_groups_flat. Fires when the 5-minute Sum
# of HTTPCode_Target_5XX_Count stays above local.tg_5xx_threshold for 3
# consecutive periods; notifications go to var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "tg_5xx_errors" {
  for_each = local.target_groups_flat

  alarm_name        = "CW-TG-${each.value.tg_name}-HTTPCode5XX"
  alarm_description = "This metric monitors 5xx errors from target group ${each.value.tg_name}. Alarms after 3 consecutive breaches (15 minutes)"

  namespace   = "AWS/ApplicationELB"
  metric_name = "HTTPCode_Target_5XX_Count"
  statistic   = "Sum"

  # Numeric arguments are written as numbers rather than quoted strings.
  period              = 300 # 5 minutes
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.tg_5xx_threshold

  # Periods with no data points are not counted as breaches.
  treat_missing_data = "notBreaching"
  alarm_actions      = [var.sns_topic_arn]

  # The metric is selected by the target group and the load balancer it sits behind.
  dimensions = {
    TargetGroup  = each.value.tg_arn_suffix
    LoadBalancer = each.value.alb_arn_suffix
  }

  tags = {
    Name        = "CW-TG-${each.value.tg_name}-HTTPCode5XX"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# CloudWatch Alarm for Target Group Response Time.
# One alarm per entry in local.target_groups_flat. Fires when the 5-minute
# Average of TargetResponseTime stays above local.tg_response_time_threshold
# for 3 consecutive periods; notifications go to var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "tg_response_time" {
  for_each = local.target_groups_flat

  alarm_name        = "CW-TG-${each.value.tg_name}-TargetResponseTime"
  alarm_description = "This metric monitors target response time for target group ${each.value.tg_name}. Alarms after 3 consecutive breaches (15 minutes)"

  namespace   = "AWS/ApplicationELB"
  metric_name = "TargetResponseTime"
  statistic   = "Average"

  # Numeric arguments are written as numbers rather than quoted strings.
  period              = 300 # 5 minutes
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.tg_response_time_threshold

  # Periods with no data points are not counted as breaches.
  treat_missing_data = "notBreaching"
  alarm_actions      = [var.sns_topic_arn]

  # The metric is selected by the target group and the load balancer it sits behind.
  dimensions = {
    TargetGroup  = each.value.tg_arn_suffix
    LoadBalancer = each.value.alb_arn_suffix
  }

  tags = {
    Name        = "CW-TG-${each.value.tg_name}-TargetResponseTime"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# CloudWatch Alarm for Target Group - All Targets Unhealthy (No Healthy Hosts).
# One alarm per entry in local.target_groups_flat. Fires when the 5-minute
# Minimum of HealthyHostCount is <= 0 for 2 consecutive periods; notifications
# go to var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "tg_all_unhealthy" {
  for_each = local.target_groups_flat

  alarm_name        = "CW-TG-${each.value.tg_name}-AllTargetsUnhealthy"
  alarm_description = "This metric alerts when all targets are unhealthy in target group ${each.value.tg_name}. Uses Minimum statistic to detect any moment with zero healthy hosts."

  namespace   = "AWS/ApplicationELB"
  metric_name = "HealthyHostCount"
  statistic   = "Minimum"

  # Numeric arguments are written as numbers rather than quoted strings.
  period              = 300 # 5 minutes
  evaluation_periods  = 2
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = 0 # Alert if minimum healthy hosts is 0

  # Unlike the other alarms in this file, missing data counts as a breach here.
  treat_missing_data = "breaching"
  alarm_actions      = [var.sns_topic_arn]

  # The metric is selected by the target group and the load balancer it sits behind.
  dimensions = {
    TargetGroup  = each.value.tg_arn_suffix
    LoadBalancer = each.value.alb_arn_suffix
  }

  tags = {
    Name        = "CW-TG-${each.value.tg_name}-AllTargetsUnhealthy"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# ============================================================================
# OUTPUTS
# ============================================================================
# output "alb_alarm_arns" {
#   description = "ARNs of ALB-level CloudWatch alarms"
#   value = {
#     response_time        = { for k, v in aws_cloudwatch_metric_alarm.alb_target_response_time : k => v.arn }
#     rejected_connections = { for k, v in aws_cloudwatch_metric_alarm.alb_rejected_connections : k => v.arn }
#   }
# }
# output "target_group_alarm_arns" {
#   description = "ARNs of target group-level CloudWatch alarms"
#   value = {
#     http_4xx_errors       = { for k, v in aws_cloudwatch_metric_alarm.tg_4xx_errors : k => v.arn }
#     http_5xx_errors       = { for k, v in aws_cloudwatch_metric_alarm.tg_5xx_errors : k => v.arn }
#     response_time         = { for k, v in aws_cloudwatch_metric_alarm.tg_response_time : k => v.arn }
#     all_targets_unhealthy = { for k, v in aws_cloudwatch_metric_alarm.tg_all_unhealthy : k => v.arn }
#   }
# }
