# Terraform (HCL): CloudWatch metric alarms for Amazon OpenSearch Service domains.
# ============================================================================
# AWS OpenSearch CloudWatch Alarms
# ============================================================================
locals {
  # Domains to monitor, keyed by domain name. Every alarm resource in this file
  # iterates over this map with for_each and reads each entry's domain_name.
  opensearch_domains = {
    "AppName-pgdataoss" = {
      domain_name    = "AppName-pgdataoss"
      instance_type  = "m6g.large.search"
      instance_count = 1
    }
  }

  # --- Alarm thresholds shared by all OpenSearch domains ---

  # CPU utilization, in percent.
  oss_cpu_threshold = 80

  # JVM memory pressure, in percent.
  oss_memory_threshold = 80

  # Used storage space, in MiB: 16 GB out of 20 GB (80% of capacity).
  oss_disk_threshold = 16384

  # ClusterStatus.red value that counts as a breach: 1 = Red, 0 = Green/Yellow.
  oss_cluster_status_threshold = 1
}
# ============================================================================
# OPENSEARCH ALARMS
# ============================================================================
# Alarm: sustained high CPU utilization on an OpenSearch domain.
#
# One alarm is created per entry in local.opensearch_domains. It goes into
# ALARM when the 5-minute average of CPUUtilization is above
# local.oss_cpu_threshold for 3 consecutive periods (15 minutes), and then
# triggers the action in var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "opensearch_cpu" {
  for_each = local.opensearch_domains

  alarm_name        = "CW-OSS-${each.value.domain_name}-CPUUtilization"
  alarm_description = "This metric monitors CPU utilization for OpenSearch domain ${each.value.domain_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "CPUUtilization"
  statistic   = "Average"
  dimensions = {
    DomainName = each.value.domain_name
    ClientId   = data.aws_caller_identity.current.account_id
  }

  # Evaluation: numeric arguments are written as numbers rather than quoted
  # strings, so no implicit string-to-number conversion is involved.
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.oss_cpu_threshold
  period              = 300 # seconds (5 minutes)
  evaluation_periods  = 3

  # Periods with no data points are treated as within threshold.
  treat_missing_data = "notBreaching"

  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-OSS-${each.value.domain_name}-CPUUtilization"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# Alarm: sustained high JVM memory pressure on an OpenSearch domain.
#
# One alarm is created per entry in local.opensearch_domains. It goes into
# ALARM when the 5-minute average of JVMMemoryPressure is above
# local.oss_memory_threshold for 3 consecutive periods (15 minutes), and then
# triggers the action in var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "opensearch_memory" {
  for_each = local.opensearch_domains

  alarm_name        = "CW-OSS-${each.value.domain_name}-JVMMemoryPressure"
  alarm_description = "This metric monitors JVM memory pressure for OpenSearch domain ${each.value.domain_name}. Alarms after 3 consecutive breaches (15 minutes)"

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "JVMMemoryPressure"
  statistic   = "Average"
  dimensions = {
    DomainName = each.value.domain_name
    ClientId   = data.aws_caller_identity.current.account_id
  }

  # Evaluation: numeric arguments are written as numbers rather than quoted
  # strings, so no implicit string-to-number conversion is involved.
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.oss_memory_threshold
  period              = 300 # seconds (5 minutes)
  evaluation_periods  = 3

  # Periods with no data points are treated as within threshold.
  treat_missing_data = "notBreaching"

  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-OSS-${each.value.domain_name}-JVMMemoryPressure"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# Alarm: used storage space on an OpenSearch domain is above the configured limit.
#
# One alarm is created per entry in local.opensearch_domains. It goes into
# ALARM when the 5-minute average of ClusterUsedSpace is above
# local.oss_disk_threshold for 2 consecutive periods (10 minutes), and then
# triggers the action in var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "opensearch_storage" {
  for_each = local.opensearch_domains

  alarm_name = "CW-OSS-${each.value.domain_name}-ClusterUsedSpace"
  # NOTE(review): the "16 GB (80% of 20 GB capacity)" wording below is fixed
  # text, while the actual limit comes from local.oss_disk_threshold. Update
  # both together if the threshold or the domain's volume size changes.
  alarm_description = "This metric monitors storage space usage for OpenSearch domain ${each.value.domain_name}. Alarms when used space exceeds 16 GB (80% of 20 GB capacity)"

  # Metric selection.
  namespace   = "AWS/ES"
  metric_name = "ClusterUsedSpace"
  statistic   = "Average"
  dimensions = {
    DomainName = each.value.domain_name
    ClientId   = data.aws_caller_identity.current.account_id
  }

  # Evaluation: numeric arguments are written as numbers rather than quoted
  # strings, so no implicit string-to-number conversion is involved.
  comparison_operator = "GreaterThanThreshold"
  threshold           = local.oss_disk_threshold
  period              = 300 # seconds (5 minutes)
  evaluation_periods  = 2

  # Periods with no data points are treated as within threshold.
  treat_missing_data = "notBreaching"

  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-OSS-${each.value.domain_name}-ClusterUsedSpace"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# Alarm: an OpenSearch domain has reported RED cluster status.
#
# One alarm is created per entry in local.opensearch_domains. It goes into
# ALARM as soon as the maximum of ClusterStatus.red over a single 1-minute
# period is at or above local.oss_cluster_status_threshold, and then triggers
# the action in var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "opensearch_cluster_status" {
  for_each = local.opensearch_domains

  alarm_name        = "CW-OSS-${each.value.domain_name}-ClusterStatusRed"
  alarm_description = "This metric monitors if OpenSearch cluster ${each.value.domain_name} is in RED status (critical). Alarms immediately"

  # Metric selection. Maximum is used so that a single red data point within
  # the period is enough to breach.
  namespace   = "AWS/ES"
  metric_name = "ClusterStatus.red"
  statistic   = "Maximum"
  dimensions = {
    DomainName = each.value.domain_name
    ClientId   = data.aws_caller_identity.current.account_id
  }

  # Evaluation: numeric arguments are written as numbers rather than quoted
  # strings, so no implicit string-to-number conversion is involved.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.oss_cluster_status_threshold
  period              = 60 # seconds (1 minute)
  evaluation_periods  = 1

  # Periods with no data points are treated as within threshold.
  # NOTE(review): this also means the alarm stays OK if the domain stops
  # publishing metrics altogether - confirm that is the intended behavior.
  treat_missing_data = "notBreaching"

  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-OSS-${each.value.domain_name}-ClusterStatusRed"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# Alarm: free storage space on an OpenSearch domain has dropped below 10 GB.
#
# One alarm is created per entry in local.opensearch_domains. It goes into
# ALARM when the minimum of FreeStorageSpace is below the threshold for
# 2 consecutive 5-minute periods (10 minutes), and then triggers the action in
# var.sns_topic_arn.
resource "aws_cloudwatch_metric_alarm" "opensearch_free_storage" {
  for_each = local.opensearch_domains

  alarm_name        = "CW-OSS-${each.value.domain_name}-FreeStorageSpace"
  alarm_description = "This metric monitors free storage space for OpenSearch domain ${each.value.domain_name}. Alarms when below 10 GB"

  # Metric selection. Minimum tracks the data node with the least free space;
  # an average across nodes could hide a single node that is nearly full.
  namespace   = "AWS/ES"
  metric_name = "FreeStorageSpace"
  statistic   = "Minimum"
  dimensions = {
    DomainName = each.value.domain_name
    ClientId   = data.aws_caller_identity.current.account_id
  }

  # Evaluation. The threshold is 10 GB expressed in MiB using 1024 MiB per GB,
  # the same convention as the 16384 (16 GB) used-space threshold in this file.
  # NOTE(review): the used-space threshold comment describes a 20 GB volume; on
  # such a volume this alarm would breach at roughly 50% usage - confirm the
  # intended limit.
  comparison_operator = "LessThanThreshold"
  threshold           = 10 * 1024
  period              = 300 # seconds (5 minutes)
  evaluation_periods  = 2

  # Periods with no data points are treated as within threshold.
  treat_missing_data = "notBreaching"

  alarm_actions = [var.sns_topic_arn]

  tags = {
    Name        = "CW-OSS-${each.value.domain_name}-FreeStorageSpace"
    Application = "CommonInfraResource"
    Environment = "Production"
    ManagedBy   = "Terraform"
  }
}
# ============================================================================
# DATA SOURCE
# ============================================================================
# Identity of the credentials Terraform is running with. Its account_id is used
# as the ClientId dimension on the OpenSearch alarms in this file.
data "aws_caller_identity" "current" {}
