# ============================================================================
# AWS EC2 CloudWatch Alarms
# ============================================================================

# Local variables for EC2 instances
# Shared inputs for every EC2 alarm in this file.
#   ec2_instances - instances to monitor; the map key becomes the for_each key
#                   of each alarm resource, `name` is used in alarm names and
#                   descriptions, `instance_id` is the CloudWatch dimension.
#   ec2_*         - alarm thresholds.
locals {
  ec2_instances = {
    "AppName-elk-kibanastack" = {
      name          = "AppName-elk-KibanaStack-ARM"
      instance_id   = "i-01771e43eade97da4"
      instance_type = "t4g.xlarge"
    }
    "jenkins-master" = {
      name          = "JenkinsMaster-ARM"
      instance_id   = "i-0e142f5cb34d7b9ef"
      instance_type = "t4g.medium"
    }
    "pritunl-vpn" = {
      name          = "PritunlVPN"
      instance_id   = "i-075778cbe0d88f72a"
      instance_type = "t3.small"
    }
    "AppNamedashboard-sftp" = {
      name          = "AppNameDashboard-SFTP"
      instance_id   = "i-016e4be403e68c6f0"
      instance_type = "t4g.micro"
    }
  }

  # Percentage thresholds (0-100).
  ec2_cpu_threshold    = 80 # CPU utilization
  ec2_memory_threshold = 80 # memory used, reported by the CloudWatch Agent
  ec2_disk_threshold   = 80 # disk used, reported by the CloudWatch Agent

  # Number of failed status checks that counts as a breach.
  ec2_status_check_failed = 1

  # Network traffic thresholds, in bytes (100 MB each).
  ec2_network_in_threshold  = 100000000
  ec2_network_out_threshold = 100000000
}

# ============================================================================
# EC2 ALARMS - CPU UTILIZATION
# ============================================================================

# One CPU alarm per entry in local.ec2_instances.
# Watches the 5-minute average of AWS/EC2 CPUUtilization and goes into ALARM
# once it has been above local.ec2_cpu_threshold for 3 periods in a row.
resource "aws_cloudwatch_metric_alarm" "ec2_cpu" {
  for_each = local.ec2_instances

  # Identity
  alarm_name        = "CW-EC2-${each.value.name}-CPUUtilization"
  alarm_description = "This metric monitors CPU utilization for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 3 consecutive breaches (15 minutes)"

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  dimensions = {
    InstanceId = each.value.instance_id
  }

  # Evaluation: 3 x 300 s = 15 minutes above threshold
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.ec2_cpu_threshold

  # Periods with no datapoints are counted as OK.
  treat_missing_data = "notBreaching"

  # Notification target
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-EC2-${each.value.name}-CPUUtilization"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# ============================================================================
# EC2 ALARMS - STATUS CHECKS
# ============================================================================

# One instance-status-check alarm per entry in local.ec2_instances.
# Uses the per-minute Maximum of AWS/EC2 StatusCheckFailed_Instance and goes
# into ALARM after 2 consecutive periods at or above
# local.ec2_status_check_failed (2 x 60 s = 2 minutes).
resource "aws_cloudwatch_metric_alarm" "ec2_status_check_instance" {
  for_each = local.ec2_instances

  alarm_name          = "CW-EC2-${each.value.name}-StatusCheckFailed-Instance"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 2
  metric_name         = "StatusCheckFailed_Instance"
  namespace           = "AWS/EC2"
  period              = 60 # 1 minute
  statistic           = "Maximum"
  threshold           = local.ec2_status_check_failed
  # The description states the real trigger delay: evaluation_periods x period.
  alarm_description  = "This metric monitors instance status check failures for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 2 consecutive failed checks (2 minutes)"
  alarm_actions      = [var.sns_topic_arn]
  # Periods with no datapoints are counted as OK.
  treat_missing_data = "notBreaching"

  dimensions = {
    InstanceId = each.value.instance_id
  }

  tags = {
    Name        = "CW-EC2-${each.value.name}-StatusCheckFailed-Instance"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# One system-status-check alarm per entry in local.ec2_instances.
# Uses the per-minute Maximum of AWS/EC2 StatusCheckFailed_System and goes
# into ALARM after 2 consecutive periods at or above
# local.ec2_status_check_failed (2 x 60 s = 2 minutes).
resource "aws_cloudwatch_metric_alarm" "ec2_status_check_system" {
  for_each = local.ec2_instances

  alarm_name          = "CW-EC2-${each.value.name}-StatusCheckFailed-System"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 2
  metric_name         = "StatusCheckFailed_System"
  namespace           = "AWS/EC2"
  period              = 60 # 1 minute
  statistic           = "Maximum"
  threshold           = local.ec2_status_check_failed
  # The description states the real trigger delay: evaluation_periods x period.
  alarm_description  = "This metric monitors system status check failures for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 2 consecutive failed checks (2 minutes)"
  alarm_actions      = [var.sns_topic_arn]
  # Periods with no datapoints are counted as OK.
  treat_missing_data = "notBreaching"

  dimensions = {
    InstanceId = each.value.instance_id
  }

  tags = {
    Name        = "CW-EC2-${each.value.name}-StatusCheckFailed-System"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# ============================================================================
# EC2 ALARMS - MEMORY USAGE (CloudWatch Agent)
# ============================================================================

# One memory alarm per entry in local.ec2_instances.
# Watches the 5-minute average of CWAgent mem_used_percent and goes into ALARM
# once it has been above local.ec2_memory_threshold for 3 periods in a row.
resource "aws_cloudwatch_metric_alarm" "ec2_memory" {
  for_each = local.ec2_instances

  # Identity
  alarm_name        = "CW-EC2-${each.value.name}-MemoryUsage"
  alarm_description = "This metric monitors memory usage for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 3 consecutive breaches (15 minutes)"

  # Metric under observation.
  # NOTE(review): assumes the agent publishes mem_used_percent with InstanceId
  # as its only dimension - confirm against the CloudWatch Agent configuration.
  namespace   = "CWAgent"
  metric_name = "mem_used_percent"
  dimensions = {
    InstanceId = each.value.instance_id
  }

  # Evaluation: 3 x 300 s = 15 minutes above threshold
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.ec2_memory_threshold

  # Periods with no datapoints are counted as OK.
  # NOTE(review): this also keeps the alarm in OK if the metric is never
  # published for this instance - confirm that is intended.
  treat_missing_data = "notBreaching"

  # Notification target
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-EC2-${each.value.name}-MemoryUsage"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# ============================================================================
# EC2 ALARMS - DISK USAGE (CloudWatch Agent)
# ============================================================================

# One root-disk alarm per entry in local.ec2_instances.
# Watches the 5-minute average of CWAgent disk_used_percent for path "/" and
# goes into ALARM once it has been above local.ec2_disk_threshold for 3
# periods in a row (15 minutes).
#
# The alarm only receives data when every dimension matches the published
# metric exactly. An entry in local.ec2_instances may therefore set the
# optional attributes `root_device` and `root_fstype` to override the defaults
# ("nvme0n1p1" / "ext4") for an instance whose root volume differs.
resource "aws_cloudwatch_metric_alarm" "ec2_disk" {
  for_each = local.ec2_instances

  alarm_name          = "CW-EC2-${each.value.name}-DiskUsage"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  metric_name         = "disk_used_percent"
  namespace           = "CWAgent"
  period              = 300 # 5 minutes
  statistic           = "Average"
  threshold           = local.ec2_disk_threshold
  alarm_description   = "This metric monitors root disk usage for EC2 instance ${each.value.name} (${each.value.instance_id}). Alarms after 3 consecutive breaches (15 minutes)"
  alarm_actions       = [var.sns_topic_arn]
  # Periods with no datapoints are counted as OK, so a dimension mismatch
  # would leave this alarm silently in OK - hence the per-instance overrides.
  treat_missing_data = "notBreaching"

  dimensions = {
    InstanceId = each.value.instance_id
    path       = "/"
    # try() falls back to the previous hard-coded values when the instance
    # entry does not define the optional attribute.
    device = try(each.value.root_device, "nvme0n1p1")
    fstype = try(each.value.root_fstype, "ext4")
  }

  tags = {
    Name        = "CW-EC2-${each.value.name}-DiskUsage"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}