# (Stray web-page navigation text "Skip to main content" was pasted here; kept as a comment so the file parses as HCL.)
# ============================================================================
# AWS OpenSearch CloudWatch Alarms
# ============================================================================

locals {
  # Inventory of OpenSearch domains to monitor. Every alarm resource in this
  # file iterates over this map with for_each and reads `domain_name` from the
  # entry; `instance_type` and `instance_count` are not read by the alarms
  # visible in this file.
  opensearch_domains = {
    "AppName-pgdataoss" = {
      domain_name    = "AppName-pgdataoss"
      instance_type  = "m6g.large.search"
      instance_count = 1
    }
  }

  # --- Alarm thresholds -----------------------------------------------------

  # Cluster health: the alarm compares ClusterStatus.red against this value
  # (1 = Red, 0 = Green/Yellow).
  oss_cluster_status_threshold = 1

  # Percent-based thresholds.
  oss_cpu_threshold    = 80 # CPU utilization, percent
  oss_memory_threshold = 80 # JVM memory pressure, percent

  # Used-storage threshold in MiB: 16384 MiB = 16 GB, i.e. 80% of a 20 GB volume.
  oss_disk_threshold = 16384
}

# ============================================================================
# OPENSEARCH ALARMS
# ============================================================================

# CPU utilization alarm, one per entry in local.opensearch_domains.
# Fires when the 5-minute average CPUUtilization stays above
# local.oss_cpu_threshold for 3 consecutive periods (15 minutes).
resource "aws_cloudwatch_metric_alarm" "opensearch_cpu" {
  for_each = local.opensearch_domains

  # Identity
  alarm_name        = format("CW-OSS-%s-CPUUtilization", each.value.domain_name)
  alarm_description = "This metric monitors CPU utilization for OpenSearch domain ${each.value.domain_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "CPUUtilization"
  statistic   = "Average"
  dimensions = {
    ClientId   = data.aws_caller_identity.current.account_id
    DomainName = each.value.domain_name
  }

  # Evaluation: 3 x 300 s windows, strictly greater than the threshold
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.oss_cpu_threshold
  treat_missing_data  = "notBreaching" # gaps in data do not count as breaches

  # Notification
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = format("CW-OSS-%s-CPUUtilization", each.value.domain_name)
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# JVM memory pressure alarm, one per entry in local.opensearch_domains.
# Fires when the 5-minute average JVMMemoryPressure stays above
# local.oss_memory_threshold for 3 consecutive periods (15 minutes).
resource "aws_cloudwatch_metric_alarm" "opensearch_memory" {
  for_each = local.opensearch_domains

  # Identity
  alarm_name        = format("CW-OSS-%s-JVMMemoryPressure", each.value.domain_name)
  alarm_description = "This metric monitors JVM memory pressure for OpenSearch domain ${each.value.domain_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "JVMMemoryPressure"
  statistic   = "Average"
  dimensions = {
    ClientId   = data.aws_caller_identity.current.account_id
    DomainName = each.value.domain_name
  }

  # Evaluation: 3 x 300 s windows, strictly greater than the threshold
  period              = 300
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.oss_memory_threshold
  treat_missing_data  = "notBreaching" # gaps in data do not count as breaches

  # Notification
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = format("CW-OSS-%s-JVMMemoryPressure", each.value.domain_name)
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# Used-storage alarm, one per entry in local.opensearch_domains.
# Fires when the 5-minute average ClusterUsedSpace stays above
# local.oss_disk_threshold for 2 consecutive periods (10 minutes).
# NOTE(review): the description text hard-codes "16 GB (80% of 20 GB)"; keep it
# in sync by hand if local.oss_disk_threshold changes.
resource "aws_cloudwatch_metric_alarm" "opensearch_storage" {
  for_each = local.opensearch_domains

  # Identity
  alarm_name        = format("CW-OSS-%s-ClusterUsedSpace", each.value.domain_name)
  alarm_description = "This metric monitors storage space usage for OpenSearch domain ${each.value.domain_name}. Alarms when used space exceeds 16 GB (80% of 20 GB capacity)"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "ClusterUsedSpace"
  statistic   = "Average"
  dimensions = {
    ClientId   = data.aws_caller_identity.current.account_id
    DomainName = each.value.domain_name
  }

  # Evaluation: 2 x 300 s windows, strictly greater than the threshold
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.oss_disk_threshold
  treat_missing_data  = "notBreaching" # gaps in data do not count as breaches

  # Notification
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = format("CW-OSS-%s-ClusterUsedSpace", each.value.domain_name)
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# Red-cluster alarm, one per entry in local.opensearch_domains.
# Fires as soon as the 1-minute maximum of ClusterStatus.red reaches
# local.oss_cluster_status_threshold in a single evaluation period.
resource "aws_cloudwatch_metric_alarm" "opensearch_cluster_status" {
  for_each = local.opensearch_domains

  # Identity
  alarm_name        = format("CW-OSS-%s-ClusterStatusRed", each.value.domain_name)
  alarm_description = "This metric monitors if OpenSearch cluster ${each.value.domain_name} is in RED status (critical). Alarms immediately"

  # Metric selection
  namespace   = "AWS/ES"
  metric_name = "ClusterStatus.red"
  statistic   = "Maximum"
  dimensions = {
    ClientId   = data.aws_caller_identity.current.account_id
    DomainName = each.value.domain_name
  }

  # Evaluation: a single 60 s window at or above the threshold is enough
  period              = 60
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.oss_cluster_status_threshold
  treat_missing_data  = "notBreaching" # gaps in data do not count as breaches

  # Notification
  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = format("CW-OSS-%s-ClusterStatusRed", each.value.domain_name)
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# Free-storage alarm, one per entry in local.opensearch_domains.
# Fires when the 5-minute average FreeStorageSpace stays below 10 GB for
# 2 consecutive periods (10 minutes).
#
# The threshold is expressed in MiB, matching how local.oss_disk_threshold
# expresses storage in this file (16384 = 16 GB). The previous value of 10000
# was a decimal "MB" figure and made the alarm trip at ~9.77 GiB instead of the
# 10 GB promised by the description.
resource "aws_cloudwatch_metric_alarm" "opensearch_free_storage" {
  for_each = local.opensearch_domains

  alarm_name          = "CW-OSS-${each.value.domain_name}-FreeStorageSpace"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "FreeStorageSpace"
  namespace           = "AWS/ES"
  period              = "300" # 5 minutes
  # NOTE(review): AWS's recommended alarm for this metric uses the Minimum
  # statistic (node with the least free space); confirm Average is intended
  # before adding multi-node domains.
  statistic           = "Average"
  threshold           = 10240 # 10 GB expressed in MiB (10 * 1024)
  alarm_description   = "This metric monitors free storage space for OpenSearch domain ${each.value.domain_name}. Alarms when below 10 GB"
  alarm_actions       = [var.sns_topic_arn]
  treat_missing_data  = "notBreaching"

  dimensions = {
    DomainName = each.value.domain_name
    ClientId   = data.aws_caller_identity.current.account_id
  }

  tags = {
    Name        = "CW-OSS-${each.value.domain_name}-FreeStorageSpace"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}

# ============================================================================
# DATA SOURCE
# ============================================================================

# Identity of the credentials Terraform runs with. The alarm resources in this
# file read its account_id to populate the ClientId metric dimension.
data "aws_caller_identity" "current" {}