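# NOTE: a stray "Skip to main content" line (web-page navigation text) was here; kept as a comment so the file parses as HCL.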
# ============================================================================
# AWS ElastiCache Redis CloudWatch Alarms
# ============================================================================

# Inventory of the Redis replication groups that receive alarms, together with
# the thresholds shared by the alarm resources in this file.
locals {
  # The map key doubles as the for_each key of each alarm resource.
  # replication_group_id is what the alarms interpolate into their names,
  # descriptions and tags, and what they pass as the CloudWatch dimension.
  # node_type is recorded per cluster; none of the alarms visible in this file
  # read it.
  redis_clusters = {
    "AppName-prod-redis-db" = {
      node_type            = "cache.t4g.small"
      replication_group_id = "AppName-prod-redis-db"
    }
    "AppNamedashboard-prod-vpc-redis" = {
      node_type            = "cache.t4g.micro"
      replication_group_id = "AppNamedashboard-prod-vpc-redis"
    }
  }

  # --- Alarm thresholds ------------------------------------------------------

  # Percent CPU; compared against the metric used by the CPU alarm.
  redis_cpu_threshold = 75

  # Percent of memory in use (DatabaseMemoryUsagePercentage).
  redis_memory_threshold = 80

  # Count of evictions; the evictions alarm compares this to a per-period Sum.
  redis_evictions_threshold = 100

  # Swap in bytes: 50 MB = 50 * 1024 * 1024 = 52428800.
  redis_swap_usage_threshold = 50 * 1024 * 1024
}

# ============================================================================
# REDIS ALARMS
# ============================================================================

# CPU alarm, one per entry in local.redis_clusters.
# Fires when the 5-minute average of EngineCPUUtilization is strictly above
# local.redis_cpu_threshold for 3 evaluation periods in a row (15 minutes).
#
# NOTE(review): AWS guidance suggests the host-level CPUUtilization metric for
# node types with 2 vCPUs or fewer - confirm EngineCPUUtilization is the
# intended signal for these clusters.
resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
  for_each = local.redis_clusters

  # Identity
  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-CPUUtilization"
  alarm_description = "This metric monitors CPU utilization for Redis cluster ${each.value.replication_group_id}. Alarms after 3 consecutive breaches (15 minutes)"

  # Metric selection
  namespace   = "AWS/ElastiCache"
  metric_name = "EngineCPUUtilization"
  statistic   = "Average"

  # NOTE(review): AWS documents CacheClusterId / CacheNodeId as the dimensions
  # for node-level ElastiCache metrics. Confirm data is actually published
  # under ReplicationGroupId; if not, this alarm receives no datapoints and
  # treat_missing_data below keeps it in OK.
  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # Evaluation: 3 periods x 300 seconds
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_cpu_threshold

  # Periods without datapoints count as healthy.
  treat_missing_data = "notBreaching"

  # Notification target
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-CPUUtilization"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# Memory alarm, one per entry in local.redis_clusters.
# Fires when the 5-minute average of DatabaseMemoryUsagePercentage is strictly
# above local.redis_memory_threshold for 3 evaluation periods in a row
# (15 minutes).
resource "aws_cloudwatch_metric_alarm" "redis_memory" {
  for_each = local.redis_clusters

  # Identity
  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-MemoryUsage"
  alarm_description = "This metric monitors memory usage for Redis cluster ${each.value.replication_group_id}. Alarms after 3 consecutive breaches (15 minutes)"

  # Metric selection
  namespace   = "AWS/ElastiCache"
  metric_name = "DatabaseMemoryUsagePercentage"
  statistic   = "Average"

  # NOTE(review): AWS documents CacheClusterId / CacheNodeId as the dimensions
  # for node-level ElastiCache metrics. Confirm data is actually published
  # under ReplicationGroupId; if not, this alarm receives no datapoints and
  # treat_missing_data below keeps it in OK.
  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # Evaluation: 3 periods x 300 seconds
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_memory_threshold

  # Periods without datapoints count as healthy.
  treat_missing_data = "notBreaching"

  # Notification target
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-MemoryUsage"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# Evictions alarm, one per entry in local.redis_clusters.
# Fires when the Sum of Evictions within a 5-minute period is strictly above
# local.redis_evictions_threshold for 2 evaluation periods in a row
# (10 minutes).
resource "aws_cloudwatch_metric_alarm" "redis_evictions" {
  for_each = local.redis_clusters

  # Identity
  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-Evictions"
  alarm_description = "This metric monitors evictions for Redis cluster ${each.value.replication_group_id}."

  # Metric selection; Sum because the threshold is a count per period.
  namespace   = "AWS/ElastiCache"
  metric_name = "Evictions"
  statistic   = "Sum"

  # NOTE(review): AWS documents CacheClusterId / CacheNodeId as the dimensions
  # for node-level ElastiCache metrics. Confirm data is actually published
  # under ReplicationGroupId; if not, this alarm receives no datapoints and
  # treat_missing_data below keeps it in OK.
  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # Evaluation: 2 periods x 300 seconds
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_evictions_threshold

  # Periods without datapoints count as healthy.
  treat_missing_data = "notBreaching"

  # Notification target
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-Evictions"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# Swap usage alarm, one per entry in local.redis_clusters.
# Fires when the 5-minute average of SwapUsage is strictly above
# local.redis_swap_usage_threshold (bytes) for 2 evaluation periods in a row
# (10 minutes).
resource "aws_cloudwatch_metric_alarm" "redis_swap_usage" {
  for_each = local.redis_clusters

  # Identity
  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-SwapUsage"
  alarm_description = "This metric monitors swap usage for Redis cluster ${each.value.replication_group_id}. Alarms when swap usage exceeds 50 MB"

  # Metric selection
  namespace   = "AWS/ElastiCache"
  metric_name = "SwapUsage"
  statistic   = "Average"

  # NOTE(review): AWS documents CacheClusterId / CacheNodeId as the dimensions
  # for node-level ElastiCache metrics. Confirm data is actually published
  # under ReplicationGroupId; if not, this alarm receives no datapoints and
  # treat_missing_data below keeps it in OK.
  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # Evaluation: 2 periods x 300 seconds
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_swap_usage_threshold

  # Periods without datapoints count as healthy.
  treat_missing_data = "notBreaching"

  # Notification target
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-SwapUsage"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# Connection-count alarm, one per entry in local.redis_clusters.
# Fires when the 5-minute average of CurrConnections is strictly above 5000
# for 2 evaluation periods in a row (10 minutes).
resource "aws_cloudwatch_metric_alarm" "redis_connections" {
  for_each = local.redis_clusters

  # Identity
  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-CurrentConnections"
  alarm_description = "This metric monitors current connections for Redis cluster ${each.value.replication_group_id}"

  # Metric selection
  namespace   = "AWS/ElastiCache"
  metric_name = "CurrConnections"
  statistic   = "Average"

  # NOTE(review): AWS documents CacheClusterId / CacheNodeId as the dimensions
  # for node-level ElastiCache metrics. Confirm data is actually published
  # under ReplicationGroupId; if not, this alarm receives no datapoints and
  # treat_missing_data below keeps it in OK.
  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # Evaluation: 2 periods x 300 seconds
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"

  # Set inline here, unlike the other Redis thresholds which are locals;
  # tune to the workload.
  threshold = 5000

  # Periods without datapoints count as healthy.
  treat_missing_data = "notBreaching"

  # Notification target
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-CurrentConnections"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}