# ============================================================================
# AWS Application Load Balancer (ALB) CloudWatch Alarms
# ============================================================================

# Local values describing the monitored ALBs, their target groups, and the
# alarm thresholds shared by every alarm resource in this file.
locals {
  # One entry per load balancer. The map key is used as the for_each key of
  # the ALB-level alarms, so it MUST be unique: HCL object constructors do not
  # reject a repeated key, the later entry silently replaces the earlier one
  # and the replaced ALB ends up with no alarms at all.
  #
  #   alb_name       - label interpolated into alarm names, descriptions, tags
  #   alb_arn_suffix - value of the CloudWatch "LoadBalancer" dimension
  #   target_groups  - map of target groups attached to the ALB
  #     tg_name       - label interpolated into alarm names, descriptions, tags
  #     tg_arn_suffix - value of the CloudWatch "TargetGroup" dimension
  alb_configs = {
    "AppNameDashboard-PROD-ALB" = {
      alb_name       = "AppNameDashboard-PROD-ALB"
      alb_arn_suffix = "app/AppNameDashboard-PROD-ALB/f93ab370be314618"
      target_groups = {
        "AppNameDashboard-Prod-Primary-TG" = {
          tg_name       = "AppNameDashboard-Prod-Primary-TG"
          tg_arn_suffix = "targetgroup/AppNameDashboard-Prod-Primary-TG/fb9ae754143996ce"
        }
      }
    }
    # NOTE(review): this entry previously used the key "AppName-PROD-ALB", the
    # same key as the entry below, so it was overwritten and never monitored.
    # The key and alb_name are disambiguated here with the ARN suffix id so
    # that both the for_each key and the generated alarm names are unique.
    # Replace both with the load balancer's real name once confirmed.
    "AppName-PROD-ALB-848a3cd3bb37bffd" = {
      alb_name       = "AppName-PROD-ALB-848a3cd3bb37bffd"
      alb_arn_suffix = "app/AppName-PROD-ALB/848a3cd3bb37bffd"
      target_groups = {
        "AppName-PROD-Primary-TG" = {
          tg_name       = "AppName-PROD-Primary-TG"
          tg_arn_suffix = "targetgroup/AppName-PROD-Primary-TG/104dd21318466571"
        }
      }
    }
    # Left unchanged: this is the entry that was effective before, so its
    # existing alarm resources keep the same addresses and names.
    "AppName-PROD-ALB" = {
      alb_name       = "AppName-PROD-ALB"
      alb_arn_suffix = "app/AppName-PROD-ALB/264b7bbd12acfba7"
      target_groups = {
        "AppName-PROD-TG-Primary" = {
          tg_name       = "AppName-PROD-TG-Primary"
          tg_arn_suffix = "targetgroup/AppName-PROD-TG-Primary/64453119eb44f29e"
        }
      }
    }
    "AppName-elb" = {
      alb_name       = "AppName-elb"
      alb_arn_suffix = "app/AppName-elb/35d47d181f692f3d"
      target_groups = {
        "AppName-target-grp" = {
          tg_name       = "AppName-target-grp"
          tg_arn_suffix = "targetgroup/AppName-target-grp/83efd5484608930d"
        }
      }
    }
  }

  # Flatten structure for target group alarms: one entry per (ALB, target
  # group) pair, keyed "<alb_key>_<tg_key>", carrying both ARN suffixes so a
  # target-group alarm can set the TargetGroup and LoadBalancer dimensions.
  target_groups_flat = merge([
    for alb_key, alb in local.alb_configs : {
      for tg_key, tg in alb.target_groups : "${alb_key}_${tg_key}" => {
        alb_name       = alb.alb_name
        alb_arn_suffix = alb.alb_arn_suffix
        tg_name        = tg.tg_name
        tg_arn_suffix  = tg.tg_arn_suffix
      }
    }
  ]...)

  # Alarm thresholds - Set higher to reduce noise and only alert on real issues
  tg_4xx_threshold                  = 100 # Number of 4xx errors at target group level (Sum over 5 mins)
  tg_5xx_threshold                  = 50  # Number of 5xx errors at target group level (Sum over 5 mins)
  alb_response_time_threshold       = 5   # Response time in seconds at ALB level (Average)
  tg_response_time_threshold        = 5   # Response time in seconds at target group level (Average)
  alb_rejected_connection_threshold = 25  # Number of rejected connections (Sum over 5 mins)
}

# ============================================================================
# ALB-LEVEL ALARMS
# ============================================================================

# ALB-level latency alarm. One alarm is created per entry in
# local.alb_configs and watches TargetResponseTime for the load balancer
# identified by that entry's ARN suffix.
resource "aws_cloudwatch_metric_alarm" "alb_target_response_time" {
  for_each = local.alb_configs

  alarm_name        = "CW-ALB-${each.value.alb_name}-TargetResponseTime"
  alarm_description = "This metric monitors target response time for ${each.value.alb_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Which metric to watch
  namespace   = "AWS/ApplicationELB"
  metric_name = "TargetResponseTime"
  dimensions = {
    LoadBalancer = each.value.alb_arn_suffix
  }

  # How it is evaluated: Average over 5-minute periods, 3 periods in a row
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.alb_response_time_threshold
  treat_missing_data  = "notBreaching" # periods with no datapoints do not count as breaches

  # Where notifications go
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-ALB-${each.value.alb_name}-TargetResponseTime"
    ManagedBy   = "Terraform"
    Environment = "Production"
    Application = "CommonInfraResource"
  }
}

# ALB-level rejected-connection alarm. One alarm is created per entry in
# local.alb_configs and watches the Sum of RejectedConnectionCount for the
# load balancer identified by that entry's ARN suffix.
resource "aws_cloudwatch_metric_alarm" "alb_rejected_connections" {
  for_each = local.alb_configs

  alarm_name        = "CW-ALB-${each.value.alb_name}-RejectedConnectionCount"
  alarm_description = "This metric monitors rejected connections for ${each.value.alb_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Which metric to watch
  namespace   = "AWS/ApplicationELB"
  metric_name = "RejectedConnectionCount"
  dimensions = {
    LoadBalancer = each.value.alb_arn_suffix
  }

  # How it is evaluated: Sum over 5-minute periods, 3 periods in a row
  statistic           = "Sum"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.alb_rejected_connection_threshold
  treat_missing_data  = "notBreaching" # periods with no datapoints do not count as breaches

  # Where notifications go
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-ALB-${each.value.alb_name}-RejectedConnectionCount"
    ManagedBy   = "Terraform"
    Environment = "Production"
    Application = "CommonInfraResource"
  }
}

# ============================================================================
# TARGET GROUP-LEVEL ALARMS
# ============================================================================

# Target-group 4xx alarm. One alarm is created per entry in
# local.target_groups_flat and watches the Sum of HTTPCode_Target_4XX_Count
# for that target group behind its load balancer.
resource "aws_cloudwatch_metric_alarm" "tg_4xx_errors" {
  for_each = local.target_groups_flat

  alarm_name        = "CW-TG-${each.value.tg_name}-HTTPCode4XX"
  alarm_description = "This metric monitors 4xx errors from target group ${each.value.tg_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Which metric to watch; both dimensions are set to scope it to one
  # target group on one load balancer
  namespace   = "AWS/ApplicationELB"
  metric_name = "HTTPCode_Target_4XX_Count"
  dimensions = {
    LoadBalancer = each.value.alb_arn_suffix
    TargetGroup  = each.value.tg_arn_suffix
  }

  # How it is evaluated: Sum over 5-minute periods, 3 periods in a row
  statistic           = "Sum"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.tg_4xx_threshold
  treat_missing_data  = "notBreaching" # periods with no datapoints do not count as breaches

  # Where notifications go
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-TG-${each.value.tg_name}-HTTPCode4XX"
    ManagedBy   = "Terraform"
    Environment = "Production"
    Application = "CommonInfraResource"
  }
}

# Target-group 5xx alarm. One alarm is created per entry in
# local.target_groups_flat and watches the Sum of HTTPCode_Target_5XX_Count
# for that target group behind its load balancer.
resource "aws_cloudwatch_metric_alarm" "tg_5xx_errors" {
  for_each = local.target_groups_flat

  alarm_name        = "CW-TG-${each.value.tg_name}-HTTPCode5XX"
  alarm_description = "This metric monitors 5xx errors from target group ${each.value.tg_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Which metric to watch; both dimensions are set to scope it to one
  # target group on one load balancer
  namespace   = "AWS/ApplicationELB"
  metric_name = "HTTPCode_Target_5XX_Count"
  dimensions = {
    LoadBalancer = each.value.alb_arn_suffix
    TargetGroup  = each.value.tg_arn_suffix
  }

  # How it is evaluated: Sum over 5-minute periods, 3 periods in a row
  statistic           = "Sum"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.tg_5xx_threshold
  treat_missing_data  = "notBreaching" # periods with no datapoints do not count as breaches

  # Where notifications go
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-TG-${each.value.tg_name}-HTTPCode5XX"
    ManagedBy   = "Terraform"
    Environment = "Production"
    Application = "CommonInfraResource"
  }
}

# Target-group latency alarm. One alarm is created per entry in
# local.target_groups_flat and watches the Average of TargetResponseTime
# for that target group behind its load balancer.
resource "aws_cloudwatch_metric_alarm" "tg_response_time" {
  for_each = local.target_groups_flat

  alarm_name        = "CW-TG-${each.value.tg_name}-TargetResponseTime"
  alarm_description = "This metric monitors target response time for target group ${each.value.tg_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Which metric to watch; both dimensions are set to scope it to one
  # target group on one load balancer
  namespace   = "AWS/ApplicationELB"
  metric_name = "TargetResponseTime"
  dimensions = {
    LoadBalancer = each.value.alb_arn_suffix
    TargetGroup  = each.value.tg_arn_suffix
  }

  # How it is evaluated: Average over 5-minute periods, 3 periods in a row
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.tg_response_time_threshold
  treat_missing_data  = "notBreaching" # periods with no datapoints do not count as breaches

  # Where notifications go
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-TG-${each.value.tg_name}-TargetResponseTime"
    ManagedBy   = "Terraform"
    Environment = "Production"
    Application = "CommonInfraResource"
  }
}

# Target-group "no healthy hosts" alarm. One alarm is created per entry in
# local.target_groups_flat and fires when the Minimum of HealthyHostCount is
# at or below zero for 2 consecutive 5-minute periods.
resource "aws_cloudwatch_metric_alarm" "tg_all_unhealthy" {
  for_each = local.target_groups_flat

  alarm_name        = "CW-TG-${each.value.tg_name}-AllTargetsUnhealthy"
  alarm_description = "This metric alerts when all targets are unhealthy in target group ${each.value.tg_name}. Uses Minimum statistic to detect any moment with zero healthy hosts."

  # Which metric to watch; both dimensions are set to scope it to one
  # target group on one load balancer
  namespace   = "AWS/ApplicationELB"
  metric_name = "HealthyHostCount"
  dimensions = {
    LoadBalancer = each.value.alb_arn_suffix
    TargetGroup  = each.value.tg_arn_suffix
  }

  # How it is evaluated: Minimum over 5-minute periods, 2 periods in a row
  statistic           = "Minimum"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = 0           # Alert if minimum healthy hosts is 0
  treat_missing_data  = "breaching" # unlike the other alarms, missing data counts as a breach here

  # Where notifications go
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-TG-${each.value.tg_name}-AllTargetsUnhealthy"
    ManagedBy   = "Terraform"
    Environment = "Production"
    Application = "CommonInfraResource"
  }
}

# ============================================================================
# OUTPUTS
# ============================================================================

# output "alb_alarm_arns" {
#   description = "ARNs of ALB-level CloudWatch alarms"
#   value = {
#     response_time        = { for k, v in aws_cloudwatch_metric_alarm.alb_target_response_time : k => v.arn }
#     rejected_connections = { for k, v in aws_cloudwatch_metric_alarm.alb_rejected_connections : k => v.arn }
#   }
# }

# output "target_group_alarm_arns" {
#   description = "ARNs of target group-level CloudWatch alarms"
#   value = {
#     http_4xx_errors       = { for k, v in aws_cloudwatch_metric_alarm.tg_4xx_errors : k => v.arn }
#     http_5xx_errors       = { for k, v in aws_cloudwatch_metric_alarm.tg_5xx_errors : k => v.arn }
#     response_time         = { for k, v in aws_cloudwatch_metric_alarm.tg_response_time : k => v.arn }
#     all_targets_unhealthy = { for k, v in aws_cloudwatch_metric_alarm.tg_all_unhealthy : k => v.arn }
#   }
# }