# ============================================================================
# AWS ElastiCache Redis CloudWatch Alarms
# ============================================================================
# Inventory of the monitored ElastiCache Redis replication groups, plus the
# alarm thresholds read by the alarm resources in this file.
locals {
  # Keyed by replication group ID. The alarm resources in this file iterate
  # this map with for_each, so each entry gets its own set of alarms.
  # node_type is not read by the alarms visible in this file.
  redis_clusters = {
    "AppName-prod-redis-db" = {
      replication_group_id = "AppName-prod-redis-db"
      node_type            = "cache.t4g.small"
    }

    "AppNamedashboard-prod-vpc-redis" = {
      replication_group_id = "AppNamedashboard-prod-vpc-redis"
      node_type            = "cache.t4g.micro"
    }
  }

  # --- Alarm thresholds for Redis ---

  # CPU utilization, in percent.
  redis_cpu_threshold = 75

  # Memory usage, in percent (DatabaseMemoryUsagePercentage).
  redis_memory_threshold = 80

  # Number of evictions.
  redis_evictions_threshold = 100

  # Swap usage, in bytes: 50 MB = 52428800.
  redis_swap_usage_threshold = 50 * 1024 * 1024
}
# ============================================================================
# REDIS ALARMS
# ============================================================================
# CPU alarm, one instance per entry in local.redis_clusters.
# Goes into ALARM when the 5-minute average of EngineCPUUtilization is above
# local.redis_cpu_threshold for 3 consecutive periods (15 minutes).
#
# NOTE(review): the alarm name and tag say "CPUUtilization" but the metric is
# EngineCPUUtilization. AWS guidance reportedly prefers the host-level
# CPUUtilization metric on node types with 2 or fewer vCPUs - confirm which
# metric is intended for these clusters.
# NOTE(review): verify this metric is published under the ReplicationGroupId
# dimension; with treat_missing_data = "notBreaching", an alarm that receives
# no datapoints would never fire.
resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
  for_each = local.redis_clusters

  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-CPUUtilization"
  alarm_description = "This metric monitors CPU utilization for Redis cluster ${each.value.replication_group_id}. Alarms after 3 consecutive breaches (15 minutes)"

  # Which metric to watch.
  namespace   = "AWS/ElastiCache"
  metric_name = "EngineCPUUtilization"

  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # How the metric is evaluated: 3 periods of 300 seconds (5 minutes) each.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_cpu_threshold
  treat_missing_data  = "notBreaching"

  # Where notifications go.
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-CPUUtilization"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# Memory alarm, one instance per entry in local.redis_clusters.
# Goes into ALARM when the 5-minute average of DatabaseMemoryUsagePercentage
# is above local.redis_memory_threshold for 3 consecutive periods (15 minutes).
#
# NOTE(review): verify this metric is published under the ReplicationGroupId
# dimension; with treat_missing_data = "notBreaching", an alarm that receives
# no datapoints would never fire.
resource "aws_cloudwatch_metric_alarm" "redis_memory" {
  for_each = local.redis_clusters

  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-MemoryUsage"
  alarm_description = "This metric monitors memory usage for Redis cluster ${each.value.replication_group_id}. Alarms after 3 consecutive breaches (15 minutes)"

  # Which metric to watch.
  namespace   = "AWS/ElastiCache"
  metric_name = "DatabaseMemoryUsagePercentage"

  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # How the metric is evaluated: 3 periods of 300 seconds (5 minutes) each.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_memory_threshold
  treat_missing_data  = "notBreaching"

  # Where notifications go.
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-MemoryUsage"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# Evictions alarm, one instance per entry in local.redis_clusters.
# Goes into ALARM when the 5-minute Sum of Evictions is above
# local.redis_evictions_threshold for 2 consecutive periods (10 minutes).
#
# NOTE(review): verify this metric is published under the ReplicationGroupId
# dimension; with treat_missing_data = "notBreaching", an alarm that receives
# no datapoints would never fire.
resource "aws_cloudwatch_metric_alarm" "redis_evictions" {
  for_each = local.redis_clusters

  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-Evictions"
  alarm_description = "This metric monitors evictions for Redis cluster ${each.value.replication_group_id}."

  # Which metric to watch.
  namespace   = "AWS/ElastiCache"
  metric_name = "Evictions"

  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # How the metric is evaluated: 2 periods of 300 seconds (5 minutes) each.
  statistic           = "Sum"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_evictions_threshold
  treat_missing_data  = "notBreaching"

  # Where notifications go.
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-Evictions"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# Swap usage alarm, one instance per entry in local.redis_clusters.
# Goes into ALARM when the 5-minute average of SwapUsage is above
# local.redis_swap_usage_threshold for 2 consecutive periods (10 minutes).
#
# NOTE(review): the description hard-codes "50 MB"; keep it in step with
# local.redis_swap_usage_threshold if that value is ever changed.
# NOTE(review): verify this metric is published under the ReplicationGroupId
# dimension; with treat_missing_data = "notBreaching", an alarm that receives
# no datapoints would never fire.
resource "aws_cloudwatch_metric_alarm" "redis_swap_usage" {
  for_each = local.redis_clusters

  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-SwapUsage"
  alarm_description = "This metric monitors swap usage for Redis cluster ${each.value.replication_group_id}. Alarms when swap usage exceeds 50 MB"

  # Which metric to watch.
  namespace   = "AWS/ElastiCache"
  metric_name = "SwapUsage"

  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # How the metric is evaluated: 2 periods of 300 seconds (5 minutes) each.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_swap_usage_threshold
  treat_missing_data  = "notBreaching"

  # Where notifications go.
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-SwapUsage"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# Threshold for the connections alarm below, kept as a named local so it is
# tuned the same way as the other redis_*_threshold values rather than being
# an inline magic number. Terraform merges multiple locals blocks in a module.
locals {
  # Number of current client connections; adjust to the workload.
  redis_connections_threshold = 5000
}

# Current-connections alarm, one instance per entry in local.redis_clusters.
# Goes into ALARM when the 5-minute average of CurrConnections is above
# local.redis_connections_threshold for 2 consecutive periods (10 minutes).
#
# NOTE(review): verify this metric is published under the ReplicationGroupId
# dimension; with treat_missing_data = "notBreaching", an alarm that receives
# no datapoints would never fire.
resource "aws_cloudwatch_metric_alarm" "redis_connections" {
  for_each = local.redis_clusters

  alarm_name        = "CW-REDIS-${each.value.replication_group_id}-CurrentConnections"
  alarm_description = "This metric monitors current connections for Redis cluster ${each.value.replication_group_id}"

  # Which metric to watch.
  namespace   = "AWS/ElastiCache"
  metric_name = "CurrConnections"

  dimensions = {
    ReplicationGroupId = each.value.replication_group_id
  }

  # How the metric is evaluated: 2 periods of 300 seconds (5 minutes) each.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.redis_connections_threshold
  treat_missing_data  = "notBreaching"

  # Where notifications go.
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-REDIS-${each.value.replication_group_id}-CurrentConnections"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
