locals {
  # Name prefix with every "-" replaced by "_", used as the leading part of
  # the metric names below.
  monitoring_metric_prefix = replace(local.name_prefix, "-", "_")

  # Catalogue of structured bot failure events, keyed by a short identifier.
  # Each entry carries:
  #   event        - the event name matched against jsonPayload.event
  #   metric_name  - the log-based metric name derived from the prefix above
  #   display_name - the human-readable alert policy title
  # NOTE(review): the only consumers visible in this file are the
  # commented-out `bot_error_events` resources further down (DEV-187).
  bot_error_metrics = {
    telegram_bot_error = {
      event        = "telegram.bot_error"
      metric_name  = "${local.monitoring_metric_prefix}_telegram_bot_error"
      display_name = "${local.name_prefix} Telegram bot error"
    }

    payment_ingest_failed = {
      event        = "payment.ingest_failed"
      metric_name  = "${local.monitoring_metric_prefix}_payment_ingest_failed"
      display_name = "${local.name_prefix} payment ingest failed"
    }

    purchase_ingest_failed = {
      event        = "purchase.ingest_failed"
      metric_name  = "${local.monitoring_metric_prefix}_purchase_ingest_failed"
      display_name = "${local.name_prefix} purchase ingest failed"
    }

    assistant_reply_failed = {
      event        = "assistant.reply_failed"
      metric_name  = "${local.monitoring_metric_prefix}_assistant_reply_failed"
      display_name = "${local.name_prefix} assistant reply failed"
    }

    reminder_dispatch_failed = {
      event        = "scheduler.reminder.dispatch_failed"
      metric_name  = "${local.monitoring_metric_prefix}_scheduler_reminder_dispatch_failed"
      display_name = "${local.name_prefix} reminder dispatch failed"
    }
  }
}

# Log exclusion for Cloud Run request logs with HTTP status 200 whose user
# agent matches a health-check prober or whose URL matches a static asset
# extension or ends in /health. The top-level filter terms are implicitly
# ANDed; the parenthesised group is an OR of the four patterns.
resource "google_logging_project_exclusion" "noise" {
  project     = var.project_id
  name        = "${local.name_prefix}-log-exclusion"
  description = "Exclude successful health checks and static assets from ingestion"

  filter = <<-EOT
    resource.type="cloud_run_revision"
    httpRequest.status=200
    (
      httpRequest.userAgent =~ "GoogleHC/.*" OR
      httpRequest.userAgent =~ "kube-probe/.*" OR
      httpRequest.requestUrl =~ ".*\\.(js|css|png|jpg|jpeg|ico|svg|woff|woff2|map)$" OR
      httpRequest.requestUrl =~ ".*/health$"
    )
  EOT
}

# Retention setting for the project's `_Default` log bucket in the `global`
# location: entries are kept for one day.
resource "google_logging_project_bucket_config" "default" {
  project        = var.project_id
  bucket_id      = "_Default"
  location       = "global"
  retention_days = 1
}

# One email notification channel per address in var.alert_notification_emails.
# toset() de-duplicates the list and makes each address the for_each key.
resource "google_monitoring_notification_channel" "email" {
  for_each = toset(var.alert_notification_emails)

  project      = var.project_id
  type         = "email"
  display_name = "${local.name_prefix} alerts ${each.value}"

  labels = {
    email_address = each.value
  }

  depends_on = [google_project_service.enabled]
}

# DEV-187: Commented out to save ~$0.47/month on Cloud Monitoring costs
# TODO: Re-enable if alerting on specific bot error events becomes necessary
# resource "google_logging_metric" "bot_error_events" {
#   for_each = local.bot_error_metrics
#
#   project     = var.project_id
#   name        = each.value.metric_name
#   description = "Counts `${each.value.event}` log events for ${module.bot_api_service.name}."
#   filter      = <<-EOT
#     resource.type="cloud_run_revision"
#     resource.labels.service_name="${module.bot_api_service.name}"
#     jsonPayload.event="${each.value.event}"
#   EOT
#
#   metric_descriptor {
#     metric_kind = "DELTA"
#     value_type  = "INT64"
#     unit        = "1"
#   }
#
#   depends_on = [google_project_service.enabled]
# }

# Alert policy for 5xx responses from the bot API Cloud Run service.
# Request counts with response_code_class="5xx" are summed over 300s windows
# and across series (grouped by service name); the condition triggers as soon
# as one such sum is greater than 0 (duration "0s", trigger count 1).
# Every email notification channel defined above is attached.
resource "google_monitoring_alert_policy" "bot_api_5xx" {
  project      = var.project_id
  display_name = "${local.name_prefix} bot API 5xx"
  combiner     = "OR"

  notification_channels = [
    for email_channel in google_monitoring_notification_channel.email : email_channel.name
  ]

  documentation {
    mime_type = "text/markdown"
    content   = "Cloud Run is returning 5xx responses for `${module.bot_api_service.name}` in `${var.environment}`."
  }

  conditions {
    display_name = "Cloud Run 5xx responses"

    condition_threshold {
      filter = <<-EOT
        resource.type="cloud_run_revision"
        resource.labels.service_name="${module.bot_api_service.name}"
        metric.type="run.googleapis.com/request_count"
        metric.labels.response_code_class="5xx"
      EOT

      comparison      = "COMPARISON_GT"
      threshold_value = 0
      duration        = "0s"

      aggregations {
        alignment_period     = "300s"
        per_series_aligner   = "ALIGN_SUM"
        cross_series_reducer = "REDUCE_SUM"
        group_by_fields      = ["resource.labels.service_name"]
      }

      trigger {
        count = 1
      }
    }
  }

  # auto_close is set to 1800 seconds (30 minutes).
  alert_strategy {
    auto_close = "1800s"
  }

  depends_on = [google_project_service.enabled]
}

# DEV-187: Commented out to save ~$0.47/month on Cloud Monitoring costs
# TODO: Re-enable if alerting on specific bot error events becomes necessary
# resource "google_monitoring_alert_policy" "bot_error_events" {
#   for_each = local.bot_error_metrics
#
#   project      = var.project_id
#   display_name = each.value.display_name
#   combiner     = "OR"
#
#   notification_channels = [
#     for channel in google_monitoring_notification_channel.email : channel.name
#   ]
#
#   documentation {
#     content   = "Structured bot failure event `${each.value.event}` was logged by `${module.bot_api_service.name}` in `${var.environment}`."
#     mime_type = "text/markdown"
#   }
#
#   conditions {
#     display_name = each.value.display_name
#
#     condition_threshold {
#       filter = <<-EOT
#         resource.type="cloud_run_revision"
#         resource.labels.service_name="${module.bot_api_service.name}"
#         metric.type="logging.googleapis.com/user/${google_logging_metric.bot_error_events[each.key].name}"
#       EOT
#
#       comparison      = "COMPARISON_GT"
#       threshold_value = 0
#       duration        = "0s"
#
#       aggregations {
#         alignment_period   = "300s"
#         per_series_aligner = "ALIGN_RATE"
#       }
#
#       trigger {
#         count = 1
#       }
#     }
#   }
#
#   alert_strategy {
#     auto_close = "1800s"
#   }
#
#   depends_on = [google_logging_metric.bot_error_events]
# }