mirror of
https://github.com/whekin/household-bot.git
synced 2026-03-31 17:34:03 +00:00
- Add latest tag push alongside SHA tag for manual rollback/debugging
- Reduce log retention from 3 to 1 day
- Comment out bot_error_metrics and alerts to save ~$0.47/month
- Minor whitespace fix in cd.yml
192 lines
5.6 KiB
HCL
192 lines
5.6 KiB
HCL
# Naming helpers for monitoring plus the catalogue of structured bot-failure events.
locals {
  # name_prefix with every "-" replaced by "_", used as the prefix of metric names.
  monitoring_metric_prefix = replace(local.name_prefix, "-", "_")

  # Map keyed by a short identifier. Each entry exposes:
  #   event        - the structured event name
  #   metric_name  - "<monitoring_metric_prefix>_<event with '.' turned into '_'>"
  #   display_name - "<name_prefix> <human-readable label>"
  #
  # NOTE: in this file the only consumers (the "bot_error_events" logging metric and
  # alert policy) are commented out, so this map is currently unused here. It is kept
  # so those resources can be re-enabled without re-deriving the names.
  bot_error_metrics = {
    for id, spec in {
      telegram_bot_error       = { event = "telegram.bot_error", label = "Telegram bot error" }
      payment_ingest_failed    = { event = "payment.ingest_failed", label = "payment ingest failed" }
      purchase_ingest_failed   = { event = "purchase.ingest_failed", label = "purchase ingest failed" }
      assistant_reply_failed   = { event = "assistant.reply_failed", label = "assistant reply failed" }
      reminder_dispatch_failed = { event = "scheduler.reminder.dispatch_failed", label = "reminder dispatch failed" }
    } : id => {
      event        = spec.event
      metric_name  = "${local.monitoring_metric_prefix}_${replace(spec.event, ".", "_")}"
      display_name = "${local.name_prefix} ${spec.label}"
    }
  }
}
|
|
|
|
# Project-level log exclusion: drops HTTP 200 Cloud Run request logs that come from
# health-check probes or that target static assets / a path ending in "/health".
resource "google_logging_project_exclusion" "noise" {
  project     = var.project_id
  name        = "${local.name_prefix}-log-exclusion"
  description = "Exclude successful health checks and static assets from ingestion"

  # The first two lines narrow the match to successful Cloud Run requests; the
  # parenthesised group then lists the alternatives (any one is enough).
  # The filter text is passed through verbatim, so it is left exactly as written.
  filter = <<-EOT
    resource.type="cloud_run_revision"
    httpRequest.status=200
    (
    httpRequest.userAgent =~ "GoogleHC/.*" OR
    httpRequest.userAgent =~ "kube-probe/.*" OR
    httpRequest.requestUrl =~ ".*\\.(js|css|png|jpg|jpeg|ico|svg|woff|woff2|map)$" OR
    httpRequest.requestUrl =~ ".*/health$"
    )
  EOT
}
|
|
|
|
# Retention setting for the project's "_Default" log bucket in the "global" location.
resource "google_logging_project_bucket_config" "default" {
  project   = var.project_id
  bucket_id = "_Default"
  location  = "global"

  # Log entries in this bucket are kept for one day.
  # NOTE(review): confirm against Cloud Logging pricing that such a short window
  # actually lowers cost; it limits how far back incidents can be investigated.
  retention_days = 1
}
|
|
|
|
# One e-mail notification channel per address in var.alert_notification_emails.
resource "google_monitoring_notification_channel" "email" {
  # Converting to a set de-duplicates the addresses; for a set, each.key and
  # each.value are the same string (the address itself).
  for_each = toset(var.alert_notification_emails)

  project      = var.project_id
  type         = "email"
  display_name = "${local.name_prefix} alerts ${each.key}"

  labels = {
    email_address = each.key
  }

  # Create channels only after the project services in google_project_service.enabled.
  depends_on = [google_project_service.enabled]
}
|
|
|
|
# DEV-187: Commented out to save ~$0.47/month on Cloud Monitoring costs
# TODO: Re-enable if alerting on specific bot error events becomes necessary
# resource "google_logging_metric" "bot_error_events" {
#   for_each = local.bot_error_metrics
#
#   project     = var.project_id
#   name        = each.value.metric_name
#   description = "Counts `${each.value.event}` log events for ${module.bot_api_service.name}."
#   filter      = <<-EOT
#     resource.type="cloud_run_revision"
#     resource.labels.service_name="${module.bot_api_service.name}"
#     jsonPayload.event="${each.value.event}"
#   EOT
#
#   metric_descriptor {
#     metric_kind = "DELTA"
#     value_type  = "INT64"
#     unit        = "1"
#   }
#
#   depends_on = [google_project_service.enabled]
# }
|
|
|
|
# Alert policy that opens an incident when the bot API Cloud Run service returns
# any 5xx response, and notifies every configured e-mail channel.
resource "google_monitoring_alert_policy" "bot_api_5xx" {
  project      = var.project_id
  display_name = "${local.name_prefix} bot API 5xx"
  combiner     = "OR"

  # Names of all e-mail channels declared in this file (empty when none exist).
  notification_channels = values(google_monitoring_notification_channel.email)[*].name

  documentation {
    mime_type = "text/markdown"
    content   = "Cloud Run is returning 5xx responses for `${module.bot_api_service.name}` in `${var.environment}`."
  }

  # An open incident is closed automatically after 1800 seconds.
  alert_strategy {
    auto_close = "1800s"
  }

  conditions {
    display_name = "Cloud Run 5xx responses"

    condition_threshold {
      # Request-count series of this service, limited to the 5xx response class.
      filter = <<-EOT
        resource.type="cloud_run_revision"
        resource.labels.service_name="${module.bot_api_service.name}"
        metric.type="run.googleapis.com/request_count"
        metric.labels.response_code_class="5xx"
      EOT

      # Fire as soon as the aggregated value exceeds zero: no minimum duration,
      # and a single violating series is enough.
      comparison      = "COMPARISON_GT"
      threshold_value = 0
      duration        = "0s"

      trigger {
        count = 1
      }

      # Sum requests per 300-second window, then sum across series per service name.
      aggregations {
        alignment_period     = "300s"
        per_series_aligner   = "ALIGN_SUM"
        cross_series_reducer = "REDUCE_SUM"
        group_by_fields      = ["resource.labels.service_name"]
      }
    }
  }

  depends_on = [google_project_service.enabled]
}
|
|
|
|
# DEV-187: Commented out to save ~$0.47/month on Cloud Monitoring costs
# TODO: Re-enable if alerting on specific bot error events becomes necessary
# resource "google_monitoring_alert_policy" "bot_error_events" {
#   for_each = local.bot_error_metrics
#
#   project      = var.project_id
#   display_name = each.value.display_name
#   combiner     = "OR"
#
#   notification_channels = [
#     for channel in google_monitoring_notification_channel.email : channel.name
#   ]
#
#   documentation {
#     content   = "Structured bot failure event `${each.value.event}` was logged by `${module.bot_api_service.name}` in `${var.environment}`."
#     mime_type = "text/markdown"
#   }
#
#   conditions {
#     display_name = each.value.display_name
#
#     condition_threshold {
#       filter = <<-EOT
#         resource.type="cloud_run_revision"
#         resource.labels.service_name="${module.bot_api_service.name}"
#         metric.type="logging.googleapis.com/user/${google_logging_metric.bot_error_events[each.key].name}"
#       EOT
#
#       comparison      = "COMPARISON_GT"
#       threshold_value = 0
#       duration        = "0s"
#
#       aggregations {
#         alignment_period   = "300s"
#         per_series_aligner = "ALIGN_RATE"
#       }
#
#       trigger {
#         count = 1
#       }
#     }
#   }
#
#   alert_strategy {
#     auto_close = "1800s"
#   }
#
#   depends_on = [google_logging_metric.bot_error_events]
# }
|