From 643ca9396737373cc540f2ec5a5e9f3ba3dcd684 Mon Sep 17 00:00:00 2001
From: whekin
Date: Wed, 11 Mar 2026 15:26:08 +0400
Subject: [PATCH] feat(infra): add terraform bot alerting baseline

---
 infra/terraform/README.md                |  18 +++
 infra/terraform/locals.tf                |   2 +
 infra/terraform/monitoring.tf            | 170 +++++++++++++++++++++++
 infra/terraform/terraform.tfvars.example |   3 +
 infra/terraform/variables.tf             |   6 +
 5 files changed, 199 insertions(+)
 create mode 100644 infra/terraform/monitoring.tf

diff --git a/infra/terraform/README.md b/infra/terraform/README.md
index 62df321..dd8e819 100644
--- a/infra/terraform/README.md
+++ b/infra/terraform/README.md
@@ -83,6 +83,24 @@ Recommended approach:
      `bot_assistant_rate_limit_rolling`,
      `bot_assistant_rate_limit_rolling_window_ms`
    - optional `bot_mini_app_allowed_origins`
+   - optional `alert_notification_emails`
+
+## Alerting baseline
+
+Terraform can also provision a minimal monitoring baseline for the bot:
+
+- email notification channels from `alert_notification_emails`
+- log-based metrics for:
+  - `telegram.bot_error`
+  - `payment.ingest_failed`
+  - `purchase.ingest_failed`
+  - `assistant.reply_failed`
+  - `scheduler.reminder.dispatch_failed`
+- an alert policy for Cloud Run 5xx responses on the bot API service
+- one alert policy per structured bot failure event above
+
+If you use email channels, Google Cloud will send a one-time confirmation email for each address.
+The notification channel will not deliver alerts until that confirmation step is completed.
 
 ## CI validation
 
diff --git a/infra/terraform/locals.tf b/infra/terraform/locals.tf
index 0fcd60c..67f9142 100644
--- a/infra/terraform/locals.tf
+++ b/infra/terraform/locals.tf
@@ -40,6 +40,8 @@ locals {
     "cloudscheduler.googleapis.com",
     "iam.googleapis.com",
     "iamcredentials.googleapis.com",
+    "logging.googleapis.com",
+    "monitoring.googleapis.com",
     "run.googleapis.com",
     "secretmanager.googleapis.com",
     "sts.googleapis.com"
diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf
new file mode 100644
index 0000000..df008fc
--- /dev/null
+++ b/infra/terraform/monitoring.tf
@@ -0,0 +1,170 @@
+# Shared naming plus the catalogue of structured bot failure events to monitor.
+locals {
+  monitoring_metric_prefix = replace(local.name_prefix, "-", "_")
+
+  bot_error_metrics = {
+    telegram_bot_error = {
+      event        = "telegram.bot_error"
+      metric_name  = "${local.monitoring_metric_prefix}_telegram_bot_error"
+      display_name = "${local.name_prefix} Telegram bot error"
+    }
+    payment_ingest_failed = {
+      event        = "payment.ingest_failed"
+      metric_name  = "${local.monitoring_metric_prefix}_payment_ingest_failed"
+      display_name = "${local.name_prefix} payment ingest failed"
+    }
+    purchase_ingest_failed = {
+      event        = "purchase.ingest_failed"
+      metric_name  = "${local.monitoring_metric_prefix}_purchase_ingest_failed"
+      display_name = "${local.name_prefix} purchase ingest failed"
+    }
+    assistant_reply_failed = {
+      event        = "assistant.reply_failed"
+      metric_name  = "${local.monitoring_metric_prefix}_assistant_reply_failed"
+      display_name = "${local.name_prefix} assistant reply failed"
+    }
+    reminder_dispatch_failed = {
+      event        = "scheduler.reminder.dispatch_failed"
+      metric_name  = "${local.monitoring_metric_prefix}_scheduler_reminder_dispatch_failed"
+      display_name = "${local.name_prefix} reminder dispatch failed"
+    }
+  }
+}
+
+# One email notification channel per address in var.alert_notification_emails.
+resource "google_monitoring_notification_channel" "email" {
+  for_each = toset(var.alert_notification_emails)
+
+  project      = var.project_id
+  display_name = "${local.name_prefix} alerts ${each.value}"
+  type         = "email"
+
+  labels = {
+    email_address = each.value
+  }
+
+  depends_on = [google_project_service.enabled]
+}
+
+# One DELTA/INT64 log-based metric per entry in local.bot_error_metrics.
+resource "google_logging_metric" "bot_error_events" {
+  for_each = local.bot_error_metrics
+
+  project     = var.project_id
+  name        = each.value.metric_name
+  description = "Counts `${each.value.event}` log events for ${module.bot_api_service.name}."
+  filter      = <<-EOT
+resource.type="cloud_run_revision"
+resource.labels.service_name="${module.bot_api_service.name}"
+jsonPayload.event="${each.value.event}"
+  EOT
+
+  metric_descriptor {
+    metric_kind = "DELTA"
+    value_type  = "INT64"
+    unit        = "1"
+  }
+
+  depends_on = [google_project_service.enabled]
+}
+
+# Alert when the bot API Cloud Run service reports any 5xx-class request.
+resource "google_monitoring_alert_policy" "bot_api_5xx" {
+  project      = var.project_id
+  display_name = "${local.name_prefix} bot API 5xx"
+  combiner     = "OR"
+
+  notification_channels = [
+    for channel in google_monitoring_notification_channel.email : channel.name
+  ]
+
+  documentation {
+    content   = "Cloud Run is returning 5xx responses for `${module.bot_api_service.name}` in `${var.environment}`."
+    mime_type = "text/markdown"
+  }
+
+  conditions {
+    display_name = "Cloud Run 5xx responses"
+
+    condition_threshold {
+      filter = <<-EOT
+resource.type="cloud_run_revision"
+resource.labels.service_name="${module.bot_api_service.name}"
+metric.type="run.googleapis.com/request_count"
+metric.labels.response_code_class="5xx"
+      EOT
+
+      comparison      = "COMPARISON_GT"
+      threshold_value = 0
+      duration        = "0s"
+
+      aggregations {
+        alignment_period     = "300s"
+        per_series_aligner   = "ALIGN_SUM"
+        cross_series_reducer = "REDUCE_SUM"
+        group_by_fields      = ["resource.labels.service_name"]
+      }
+
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  alert_strategy {
+    auto_close = "1800s"
+  }
+
+  depends_on = [google_project_service.enabled]
+}
+
+# One alert policy per log-based metric in google_logging_metric.bot_error_events.
+resource "google_monitoring_alert_policy" "bot_error_events" {
+  for_each = local.bot_error_metrics
+
+  project      = var.project_id
+  display_name = each.value.display_name
+  combiner     = "OR"
+
+  notification_channels = [
+    for channel in google_monitoring_notification_channel.email : channel.name
+  ]
+
+  documentation {
+    content   = "Structured bot failure event `${each.value.event}` was logged by `${module.bot_api_service.name}` in `${var.environment}`."
+    mime_type = "text/markdown"
+  }
+
+  conditions {
+    display_name = each.value.display_name
+
+    condition_threshold {
+      # Log-based metric series are written against the monitored resource of
+      # the matched log entries (cloud_run_revision, per the metric filter), so
+      # filtering on "global" would select no series and the alert never fires.
+      filter = <<-EOT
+resource.type="cloud_run_revision"
+metric.type="logging.googleapis.com/user/${google_logging_metric.bot_error_events[each.key].name}"
+      EOT
+
+      comparison      = "COMPARISON_GT"
+      threshold_value = 0
+      duration        = "0s"
+
+      aggregations {
+        alignment_period   = "300s"
+        per_series_aligner = "ALIGN_RATE"
+      }
+
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  alert_strategy {
+    auto_close = "1800s"
+  }
+
+  depends_on = [google_logging_metric.bot_error_events]
+}
diff --git a/infra/terraform/terraform.tfvars.example b/infra/terraform/terraform.tfvars.example
index a256023..c892565 100644
--- a/infra/terraform/terraform.tfvars.example
+++ b/infra/terraform/terraform.tfvars.example
@@ -23,6 +23,9 @@ bot_assistant_rate_limit_rolling_window_ms = 86400000
 bot_mini_app_allowed_origins = [
   "https://household-dev-mini-app-abc123-ew.a.run.app"
 ]
+alert_notification_emails = [
+  "alerts@example.com"
+]
 
 scheduler_utilities_cron    = "0 9 * * *"
 scheduler_rent_warning_cron = "0 9 * * *"
diff --git a/infra/terraform/variables.tf b/infra/terraform/variables.tf
index 24a5b63..6814a80 100644
--- a/infra/terraform/variables.tf
+++ b/infra/terraform/variables.tf
@@ -138,6 +138,12 @@ variable "bot_mini_app_allowed_origins" {
   default     = []
 }
 
+variable "alert_notification_emails" {
+  description = "Email addresses that should receive bot monitoring alerts"
+  type        = list(string)
+  default     = []
+}
+
 variable "openai_api_key_secret_id" {
   description = "Optional Secret Manager ID for OPENAI_API_KEY"
   type        = string