# Terraform configuration: CloudWatch alarms for EC2 instances (CPU, status checks, memory, disk).
# ============================================================================
# AWS EC2 CloudWatch Alarms
# ============================================================================
# Local variables for EC2 instances
locals {
  # Inventory of monitored EC2 instances. The map key becomes the for_each key
  # (and therefore part of the resource address) of every alarm that iterates
  # over this map, so renaming a key re-creates that instance's alarms.
  #   name          - label embedded in alarm names, descriptions and tags
  #   instance_id   - value used for the InstanceId metric dimension
  #   instance_type - informational; not referenced by the alarms in this section
  ec2_instances = {
    "AppName-elk-kibanastack" = {
      name          = "AppName-elk-KibanaStack-ARM"
      instance_id   = "i-01771e43eade97da4"
      instance_type = "t4g.xlarge"
    }

    "jenkins-master" = {
      name          = "JenkinsMaster-ARM"
      instance_id   = "i-0e142f5cb34d7b9ef"
      instance_type = "t4g.medium"
    }

    "pritunl-vpn" = {
      name          = "PritunlVPN"
      instance_id   = "i-075778cbe0d88f72a"
      instance_type = "t3.small"
    }

    "AppNamedashboard-sftp" = {
      name          = "AppNameDashboard-SFTP"
      instance_id   = "i-016e4be403e68c6f0"
      instance_type = "t4g.micro"
    }
  }

  # ---- Alarm thresholds for EC2 ----

  # CPU utilization, in percent.
  ec2_cpu_threshold = 80

  # Memory usage, in percent (metric published by the CloudWatch Agent).
  ec2_memory_threshold = 80

  # Disk usage, in percent (metric published by the CloudWatch Agent).
  ec2_disk_threshold = 80

  # Status check failure count at or above which the status alarms breach.
  ec2_status_check_failed = 1

  # Network thresholds in bytes: 100000000 = 100 MB (decimal).
  # NOTE(review): neither value is consumed by an alarm in this section;
  # presumably they are used elsewhere in the configuration - verify.
  ec2_network_in_threshold  = 100000000
  ec2_network_out_threshold = 100000000
}
# ============================================================================
# EC2 ALARMS - CPU UTILIZATION
# ============================================================================
# One CPU alarm per entry in local.ec2_instances.
# Breaches when the 5-minute average of AWS/EC2 CPUUtilization is above
# local.ec2_cpu_threshold for 3 consecutive periods (15 minutes in total).
resource "aws_cloudwatch_metric_alarm" "ec2_cpu" {
  for_each = local.ec2_instances

  # Identity
  alarm_name        = "CW-EC2-${each.value.name}-CPUUtilization"
  alarm_description = "This metric monitors CPU utilization for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 3 consecutive breaches (15 minutes)"

  # Metric selection
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  statistic   = "Average"

  dimensions = {
    InstanceId = each.value.instance_id
  }

  # Evaluation: 3 periods of 300 seconds each.
  period              = "300"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.ec2_cpu_threshold

  # Periods without datapoints count as not breaching.
  treat_missing_data = "notBreaching"

  # Notification target used when the alarm enters the ALARM state.
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-EC2-${each.value.name}-CPUUtilization"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# ============================================================================
# EC2 ALARMS - STATUS CHECKS
# ============================================================================
# One instance-status-check alarm per entry in local.ec2_instances.
# Breaches when the 1-minute maximum of AWS/EC2 StatusCheckFailed_Instance is
# at or above local.ec2_status_check_failed for 2 consecutive periods, i.e.
# after 2 minutes of failed checks (not on the first failed datapoint).
resource "aws_cloudwatch_metric_alarm" "ec2_status_check_instance" {
  for_each = local.ec2_instances

  alarm_name          = "CW-EC2-${each.value.name}-StatusCheckFailed-Instance"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = "2"
  metric_name         = "StatusCheckFailed_Instance"
  namespace           = "AWS/EC2"
  period              = "60" # 1 minute
  statistic           = "Maximum"
  threshold           = local.ec2_status_check_failed

  # The description states the real trigger condition: evaluation_periods (2)
  # x period (60 s) = 2 minutes, matching the wording used by the other alarms.
  alarm_description = "This metric monitors instance status check failures for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 2 consecutive breaches (2 minutes)"

  # Notification target used when the alarm enters the ALARM state.
  alarm_actions = [var.sns_topic_arn]

  # Periods without datapoints count as not breaching.
  treat_missing_data = "notBreaching"

  dimensions = {
    InstanceId = each.value.instance_id
  }

  tags = {
    Name        = "CW-EC2-${each.value.name}-StatusCheckFailed-Instance"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# One system-status-check alarm per entry in local.ec2_instances.
# Breaches when the 1-minute maximum of AWS/EC2 StatusCheckFailed_System is
# at or above local.ec2_status_check_failed for 2 consecutive periods, i.e.
# after 2 minutes of failed checks (not on the first failed datapoint).
resource "aws_cloudwatch_metric_alarm" "ec2_status_check_system" {
  for_each = local.ec2_instances

  alarm_name          = "CW-EC2-${each.value.name}-StatusCheckFailed-System"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = "2"
  metric_name         = "StatusCheckFailed_System"
  namespace           = "AWS/EC2"
  period              = "60" # 1 minute
  statistic           = "Maximum"
  threshold           = local.ec2_status_check_failed

  # The description states the real trigger condition: evaluation_periods (2)
  # x period (60 s) = 2 minutes, matching the wording used by the other alarms.
  alarm_description = "This metric monitors system status check failures for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 2 consecutive breaches (2 minutes)"

  # Notification target used when the alarm enters the ALARM state.
  alarm_actions = [var.sns_topic_arn]

  # Periods without datapoints count as not breaching.
  treat_missing_data = "notBreaching"

  dimensions = {
    InstanceId = each.value.instance_id
  }

  tags = {
    Name        = "CW-EC2-${each.value.name}-StatusCheckFailed-System"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# ============================================================================
# EC2 ALARMS - MEMORY USAGE (CloudWatch Agent)
# ============================================================================
# One memory alarm per entry in local.ec2_instances.
# Breaches when the 5-minute average of CWAgent mem_used_percent is above
# local.ec2_memory_threshold for 3 consecutive periods (15 minutes in total).
resource "aws_cloudwatch_metric_alarm" "ec2_memory" {
  for_each = local.ec2_instances

  # Identity
  alarm_name        = "CW-EC2-${each.value.name}-MemoryUsage"
  alarm_description = "This metric monitors memory usage for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 3 consecutive breaches (15 minutes)"

  # Metric selection
  namespace   = "CWAgent"
  metric_name = "mem_used_percent"
  statistic   = "Average"

  # NOTE(review): assumes the agent publishes this metric with InstanceId as
  # its only dimension - confirm against the CloudWatch Agent configuration.
  dimensions = {
    InstanceId = each.value.instance_id
  }

  # Evaluation: 3 periods of 300 seconds each.
  period              = "300"
  evaluation_periods  = "3"
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.ec2_memory_threshold

  # Periods without datapoints count as not breaching.
  treat_missing_data = "notBreaching"

  # Notification target used when the alarm enters the ALARM state.
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-EC2-${each.value.name}-MemoryUsage"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# ============================================================================
# EC2 ALARMS - DISK USAGE (CloudWatch Agent)
# ============================================================================
# One root-disk alarm per entry in local.ec2_instances.
# Breaches when the 5-minute average of CWAgent disk_used_percent for path "/"
# is above local.ec2_disk_threshold for 3 consecutive periods (15 minutes).
#
# The device and fstype dimensions can be overridden per instance by adding
# optional `disk_device` / `disk_fstype` attributes to that instance's entry in
# local.ec2_instances. When absent, the previous hard-coded values
# ("nvme0n1p1" / "ext4") are used, so existing entries behave as before.
# Requires Terraform >= 0.12.20 for try().
resource "aws_cloudwatch_metric_alarm" "ec2_disk" {
  for_each = local.ec2_instances

  alarm_name          = "CW-EC2-${each.value.name}-DiskUsage"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "3"
  metric_name         = "disk_used_percent"
  namespace           = "CWAgent"
  period              = "300" # 5 minutes
  statistic           = "Average"
  threshold           = local.ec2_disk_threshold
  alarm_description   = "This metric monitors root disk usage for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 3 consecutive breaches (15 minutes)"

  # Notification target used when the alarm enters the ALARM state.
  alarm_actions = [var.sns_topic_arn]

  # Periods without datapoints count as not breaching. Because of this, a
  # dimension set that does not match what the agent publishes yields an alarm
  # that never fires - hence the per-instance overrides below.
  treat_missing_data = "notBreaching"

  dimensions = {
    InstanceId = each.value.instance_id
    path       = "/"
    device     = try(each.value.disk_device, "nvme0n1p1")
    fstype     = try(each.value.disk_fstype, "ext4")
  }

  tags = {
    Name        = "CW-EC2-${each.value.name}-DiskUsage"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
