Terraform official docs: datadog_monitor | Resources | DataDog/datadog | Terraform | Terraform Registry
resource "datadog_monitor" "foo" {
  name               = "Name for monitor foo"
  type               = "metric alert"
  message            = "Monitor triggered. Notify: @hipchat-channel"
  escalation_message = "Escalation message @pagerduty"
  query              = "avg(last_1h):avg:aws.ec2.cpu{environment:foo,host:foo} by {host} > 4"
  monitor_thresholds {
    warning  = 2
    critical = 4
  }
  include_tags = true
  tags         = ["foo:bar", "team:fooBar"]
}
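All of the examples collected below assume the Datadog provider is already configured. A minimal setup sketch (the variable names are placeholders of my own, not taken from the cited pages):
terraform {
  required_providers {
    datadog = {
      source = "DataDog/datadog"
    }
  }
}

# The keys can also be supplied via the DD_API_KEY / DD_APP_KEY environment
# variables instead of Terraform variables.
provider "datadog" {
  api_key = var.datadog_api_key
  app_key = var.datadog_app_key
}

variable "datadog_api_key" {
  type      = string
  sensitive = true
}

variable "datadog_app_key" {
  type      = string
  sensitive = true
}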
TerraformでDatadogのMonitorを書く #Terraform - Qiita
#monitor
resource "datadog_monitor" "keepalive" {
  name    = "[${var.project_name}][EC2] Linux: 死活監視"
  type    = "service check"
  message = var.message
  query   = "\"datadog.agent.up\".over(\"datadog:enabled\",\"${var.project_account}\").by(\"host\").last(2).count_by_status()"
  monitor_thresholds {
    ok       = 1
    warning  = 1
    critical = 1
  }
  notify_no_data    = true
  no_data_timeframe = 2
  new_host_delay    = 300
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  notify_audit      = false
  tags              = ["service:${var.project_name}"]
}
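The keepalive example references three input variables that the source article does not show. A plausible declaration sketch (types and descriptions are assumptions):
variable "project_name" {
  type        = string
  description = "Project name used in monitor names and tags"
}

variable "project_account" {
  type        = string
  description = "Tag value that scopes the service check to this account's hosts"
}

variable "message" {
  type        = string
  description = "Notification body, including @-handles such as @slack-... or @pagerduty"
}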
TerraformでDatadogのモニタリング監視を構築・改善した話
resource "datadog_monitor" "container_web_cpu_utilization" {
  evaluation_delay = 0
  include_tags     = true
  message          = <<-EOT
    {{#is_match "ecs_container_name.name" "prod"}}
    ### アラート内容
    {{#is_warning}}
    - {{ecs_container_name}} のCPU使用率80%超過
    {{/is_warning}}
    {{#is_alert}}
    - {{ecs_container_name}} のCPU使用率90%超過
    {{/is_alert}}
    ### 対応
    - AWSコンソールよりECSの{{ecs_container_name}}コンテナの状態を確認し、CPU使用率上昇の原因を特定すること
    https://ap-northeast-1.console.aws.amazon.com/ecs/v2/clusters/syukatsu-kaigi-jp/tasks?region=ap-northeast-1
    ### 通知先
    @slack-production-channel
    {{/is_match}}
    {{#is_match "ecs_container_name.name" "stg"}}
    ### アラート内容
    {{#is_warning}}
    - {{ecs_container_name}} のCPU使用率80%超過
    {{/is_warning}}
    {{#is_alert}}
    - {{ecs_container_name}} のCPU使用率90%超過
    {{/is_alert}}
    ### 対応
    - AWSコンソールよりECSの{{ecs_container_name}}コンテナの状態を確認し、CPU使用率上昇の原因を特定すること
    https://us-west-2.console.aws.amazon.com/ecs/v2/clusters/syukatsu-kaigi-stg-jp/tasks?region=us-west-2
    ### 通知先
    @slack-staging-channel
    {{/is_match}}
  EOT
  name                 = "CPU監視 {{ecs_container_name}}"
  new_group_delay      = 0
  no_data_timeframe    = 0
  notify_audit         = false
  notify_by            = []
  notify_no_data       = false
  priority             = 0
  query                = "avg(last_30m):(avg:ecs.fargate.cpu.usage{ecs_container_name:*-container} by {ecs_container_name} / avg:ecs.fargate.cpu.limit{ecs_container_name:*-container} by {ecs_container_name}) * 0.0001 > 90"
  renotify_interval    = 20
  renotify_occurrences = 0
  require_full_window  = true
  tags = [
    "service:<service_name>"
  ]
  timeout_h = 0
  type      = "query alert"
  monitor_thresholds {
    critical          = "90"
    critical_recovery = "60"
    warning           = "80"
    warning_recovery  = "50"
  }
}
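Datadog requires the comparison value at the end of a metric/query-alert query to match monitor_thresholds.critical. A sketch that defines both thresholds once so the query and the thresholds block cannot drift apart (the local names and the shortened message are hypothetical):
# Sketch: single source of truth for warning/critical values.
locals {
  container_cpu_warning  = 80
  container_cpu_critical = 90
}

resource "datadog_monitor" "container_web_cpu_utilization_synced" {
  name    = "CPU監視 {{ecs_container_name}}"
  type    = "query alert"
  message = "{{ecs_container_name}} CPU usage is high. @slack-production-channel"
  # The value after ">" must equal monitor_thresholds.critical; interpolating the
  # same local in both places keeps them consistent.
  query   = "avg(last_30m):(avg:ecs.fargate.cpu.usage{ecs_container_name:*-container} by {ecs_container_name} / avg:ecs.fargate.cpu.limit{ecs_container_name:*-container} by {ecs_container_name}) * 0.0001 > ${local.container_cpu_critical}"
  monitor_thresholds {
    warning  = local.container_cpu_warning
    critical = local.container_cpu_critical
  }
}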
DatadogのモニタをTerraformで管理してみる #Terraform - Qiita
resource "datadog_monitor" "rs_cpu_usage" {
  name    = "[${var.environment}] CPU Usage is very high on {{host.name}}"
  type    = "metric alert"
  message = "${var.notifies1} ${var.notifies2} ${var.notifies3}"
  query   = "avg(last_5m):avg:azure.vm.percentage_cpu{type:microsoft.compute/virtualmachines} > 90"
  monitor_thresholds {
    warning  = "80.0"
    critical = "90.0"
  }
  include_tags        = false
  no_data_timeframe   = 10
  notify_no_data      = true
  require_full_window = false
}
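Concatenating var.notifies1 through var.notifies3 works, but a list variable joined into the message scales better. A sketch under that assumption (variable name, handles, and resource name are placeholders):
variable "notify_handles" {
  type    = list(string)
  default = ["@slack-alerts", "@pagerduty-infra"]
}

resource "datadog_monitor" "rs_cpu_usage_list_notify" {
  name = "CPU Usage is very high on {{host.name}}"
  type = "metric alert"
  # join() produces the same space-separated handle string as var.notifies1..3 above.
  message = join(" ", var.notify_handles)
  query   = "avg(last_5m):avg:azure.vm.percentage_cpu{type:microsoft.compute/virtualmachines} > 90"
  monitor_thresholds {
    warning  = "80.0"
    critical = "90.0"
  }
}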
[Datadog][Terraform]MonitorでMetricsのアラート設定をする – ADACHIN SERVER WIKI
ecs_alert.tf
resource "datadog_monitor" "ecs_cpu_alert" {
  name               = "ecs_cpu_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.ecs.service.cpuutilization{clustername:hoge-${var.environment}} by {clustername} > 80"
  escalation_message = "ECS/Fargate CPU usage has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true
  monitor_thresholds {
    warning  = 40
    critical = 80
  }
  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
resource "datadog_monitor" "ecs_memory_alert" {
  name               = "ecs_memory_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.ecs.service.memory_utilization{clustername:hoge-${var.environment}} by {servicename} > 80"
  escalation_message = "ECS/Fargate Memory usage has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true
  monitor_thresholds {
    warning  = 70
    critical = 80
  }
  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
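The monitors in this section repeat the same Slack notification heredoc. It could be factored into a local value; a sketch using the same placeholder handle:
locals {
  slack_notify = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
}

# Each monitor can then use:  message = local.slack_notify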
event_log_alert.tf
resource "datadog_monitor" "event_log_alert" {
  name           = "event_log_alert"
  type           = "event-v2 alert"
  query          = "events(\"status:(error OR warn OR failed) AND hoge\").rollup(\"count\").last(\"5m\") > 0"
  notify_no_data = false
  notify_audit   = false
  timeout_h      = 1
  monitor_thresholds {
    critical = 1
  }
  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
cloudfront_5xx_error_rate_alert.tf
resource "datadog_monitor" "cloudfront_5xx_error_rate_alert" {
  name               = "[hoge]cloudfront_5xx_error_rate_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.cloudfront.5xx_error_rate{distributionid:hoge OR distributionid:hoge OR distributionid:hoge} by {distributionid,aws_account} > 50"
  escalation_message = "CloudFront distributions have more than 50% 5xx error rate"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true
  monitor_thresholds {
    warning  = 30
    critical = 50
  }
  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}} @slack-alert-hoge {{/is_alert}}
    {{#is_recovery}} @slack-alert-hoge {{/is_recovery}}
  EOT
  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
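Monitors that already exist in Datadog can be adopted into Terraform state before being managed with resources like the ones above. A sketch using a Terraform 1.5+ import block (the monitor ID is a placeholder; older Terraform versions would use the terraform import CLI instead):
import {
  to = datadog_monitor.ecs_cpu_alert
  id = "12345678" # numeric monitor ID shown in the Datadog monitor URL
}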