
Datadog Monitoring: Configuring AWS Monitors with Terraform



Official Terraform documentation

Official reference: datadog_monitor | Resources | DataDog/datadog | Terraform | Terraform Registry

resource "datadog_monitor" "foo" {
  name               = "Name for monitor foo"
  type               = "metric alert"
  message            = "Monitor triggered. Notify: @hipchat-channel"
  escalation_message = "Escalation message @pagerduty"

  # The comparison value in the query (> 4) must match monitor_thresholds.critical.
  query = "avg(last_1h):avg:aws.ec2.cpu{environment:foo,host:foo} by {host} > 4"

  monitor_thresholds {
    warning  = 2
    critical = 4
  }

  include_tags = true

  tags = ["foo:bar", "team:fooBar"]
}
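None of these examples will apply on their own: the DataDog/datadog provider has to be declared and given an API key and an application key first. A minimal sketch, assuming the keys are passed in as variables (the variable names here are placeholders, not part of the registry example):

terraform {
  required_providers {
    datadog = {
      source = "DataDog/datadog"
    }
  }
}

# API and application keys come from the Datadog organization settings.
provider "datadog" {
  api_key = var.datadog_api_key
  app_key = var.datadog_app_key
}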




Configuration examples in Terraform

TerraformでDatadogのMonitorを書く #Terraform - Qiita

#monitor
resource "datadog_monitor" "keepalive" {
  name    = "[${var.project_name}][EC2] Linux: 死活監視 "
  type    = "service check"
  message = var.message
  query   = "\"datadog.agent.up\".over(\"datadog:enabled\",\"${var.project_account}\").by(\"host\").last(2).count_by_status()"
  monitor_thresholds {
    ok       = 1
    warning  = 1
    critical = 1
  }
  notify_no_data    = true
  no_data_timeframe = 2
  new_host_delay    = 300
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  notify_audit      = false
  tags              = ["service:${var.project_name}"]
}
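This example references var.project_name, var.project_account, and var.message, which the article defines elsewhere. A minimal variables.tf sketch with assumed descriptions (the names come from the example, everything else is illustrative):

variable "project_name" {
  type        = string
  description = "Project name embedded in monitor names and tags"
}

variable "project_account" {
  type        = string
  description = "Tag value that scopes the service check to the target account's hosts"
}

variable "message" {
  type        = string
  description = "Notification text, typically including an @-handle such as a Slack channel"
}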



TerraformでDatadogのモニタリング監視を構築・改善した話

resource "datadog_monitor" "container_web_cpu_utilization" {
  evaluation_delay     = 0
  include_tags         = true
  message              = <<-EOT
                            {{#is_match "ecs_container_name.name" "prod"}}
                            ### アラート内容
                            {{#is_warning}}
                            - {{ecs_container_name}} のCPU使用率80%超過
                            {{/is_warning}}
                            {{#is_alert}}
                            - {{ecs_container_name}} のCPU使用率90%超過
                            {{/is_alert}}
                            ### 対応
                            - AWSコンソールよりECSの{{ecs_container_name}}コンテナの状態を確認し、CPU使用率上昇の原因を特定すること
                              https://ap-northeast-1.console.aws.amazon.com/ecs/v2/clusters/syukatsu-kaigi-jp/tasks?region=ap-northeast-1
                            ### 通知先
                            @slack-production-channel
                            {{/is_match}}

                            {{#is_match "ecs_container_name.name" "stg"}}
                            ### アラート内容
                            {{#is_warning}}
                            - {{ecs_container_name}} のCPU使用率80%超過
                            {{/is_warning}}
                            {{#is_alert}}
                            - {{ecs_container_name}} のCPU使用率90%超過
                            {{/is_alert}}
                            ### 対応
                            - AWSコンソールよりECSの{{ecs_container_name}}コンテナの状態を確認し、CPU使用率上昇の原因を特定すること
                              https://us-west-2.console.aws.amazon.com/ecs/v2/clusters/syukatsu-kaigi-stg-jp/tasks?region=us-west-2
                            ### 通知先
                            @slack-staging-channel
                            {{/is_match}}
                          EOT
  name                 = "CPU監視 {{ecs_container_name}}"
  new_group_delay      = 0
  no_data_timeframe    = 0
  notify_audit         = false
  notify_by            = []
  notify_no_data       = false
  priority             = 0
  query                = "avg(last_30m):(avg:ecs.fargate.cpu.usage{ecs_container_name:*-container} by {ecs_container_name} / avg:ecs.fargate.cpu.limit{ecs_container_name:*-container} by {ecs_container_name}) * 0.0001 > 90"
  renotify_interval    = 20
  renotify_occurrences = 0
  require_full_window  = true
  tags = [
    "service:<service_name>"
  ]
  timeout_h = 0
  type      = "query alert"
  monitor_thresholds {
    critical          = "90"
    critical_recovery = "60"
    warning           = "80"
    warning_recovery  = "50"
  }
}
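If a monitor like this already exists because it was created in the Datadog UI, it can be brought under Terraform management by importing it with its monitor ID before the first apply (the ID below is a placeholder):

terraform import datadog_monitor.container_web_cpu_utilization 12345678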



DatadogのモニタをTerraformで管理してみる #Terraform - Qiita

resource "datadog_monitor" "rs_cpu_usage" {
  name = "[${var.environment}] CPU Usage is very high on {{host.name}}"
  type = "metric alert"
  message = "${var.notifies1} ${var.notifies2} ${var.notifies3}"
  query = "avg(last_5m):avg:azure.vm.percentage_cpu{type:microsoft.compute/virtualmachines} > 90"
  thresholds {
    "%" = "${var.detect_alert_count1}"
    warning = "80.0"
    critical = "90.0"
  }
  include_tags = "false"
  no_data_timeframe = "10"
  notify_no_data = "true"
  require_full_window = "false"
}
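The message above simply concatenates three notification variables. If the set of handles changes often, a single list variable joined with join() does the same job; the variable name and default handles below are illustrative only:

variable "notification_handles" {
  type    = list(string)
  default = ["@slack-alerts", "@pagerduty-oncall"]
}

# message = join(" ", var.notification_handles)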



[Datadog][Terraform]MonitorでMetricsのアラート設定をする – ADACHIN SERVER WIKI
ecs_alert.tf

resource "datadog_monitor" "ecs_cpu_alert" {
  name               = "ecs_cpu_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.ecs.service.cpuutilization{clustername:hoge-${var.environment}} by {clustername} > 80"
  escalation_message = "ECS/Fargate CPU usage has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true
 
  monitor_thresholds {
    warning  = 40
    critical = 80
  }
 
  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
 
  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
 
resource "datadog_monitor" "ecs_memory_alert" {
  name               = "ecs_memory_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.ecs.service.memory_utilization{clustername:hoge-${var.environment}} by {servicename} > 80"
  escalation_message = "ECS/Fargate Memory usage has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true
 
  monitor_thresholds {
    warning  = 70
    critical = 80
  }
 
  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
 
  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
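The CPU and memory monitors differ only in their query and thresholds, so one resource with for_each can generate both. A sketch under the same variables as above; the local name and structure are mine, not the article's:

locals {
  ecs_monitors = {
    cpu = {
      query    = "avg(last_5m):avg:aws.ecs.service.cpuutilization{clustername:hoge-${var.environment}} by {clustername} > 80"
      warning  = 40
      critical = 80
    }
    memory = {
      query    = "avg(last_5m):avg:aws.ecs.service.memory_utilization{clustername:hoge-${var.environment}} by {servicename} > 80"
      warning  = 70
      critical = 80
    }
  }
}

resource "datadog_monitor" "ecs_alert" {
  for_each = local.ecs_monitors

  name           = "ecs_${each.key}_alert"
  type           = "metric alert"
  query          = each.value.query
  message        = "@slack-alert-hoge"
  notify_no_data = false
  include_tags   = true

  monitor_thresholds {
    warning  = each.value.warning
    critical = each.value.critical
  }

  tags = ["product:hoge", "service:hoge", "env:${var.environment}"]
}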

event_log_alert.tf

resource "datadog_monitor" "event_log_alert" {
  name           = "event_log_alert"
  type           = "event-v2 alert"
  query          = "events(\"status:(error OR warn OR failed) AND hoge\").rollup(\"count\").last(\"5m\") > 0"
  notify_no_data = false
  notify_audit   = false
  timeout_h      = 1
 
  monitor_thresholds {
    critical = 1
  }
 
  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
 
  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}

cloudfront_5xx_error_rate_alert.tf

resource "datadog_monitor" "cloudfront_5xx_error_rate_alert" {
  name               = "[hoge]cloudfront_5xx_error_rate_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.cloudfront.5xx_error_rate{distributionid:hoge OR distributionid:hoge OR distributionid:hoge} by {distributionid,aws_account} > 50"
  escalation_message = "CloudFront distributions have more than 50% 5xx error rate"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true
 
  monitor_thresholds {
    warning  = 30
    critical = 50
  }
 
  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
 
  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
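All of the monitors in this article repeat the same Slack notification heredoc. Assuming the handles really are identical across monitors, the message can live in a single locals block and be referenced from each resource; this is a refactoring sketch, not the article's code:

locals {
  slack_alert_message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
}

# In each datadog_monitor resource:
#   message = local.slack_alert_message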