mirror of
https://github.com/whekin/household-bot.git
synced 2026-03-31 12:04:02 +00:00
feat(infra): add terraform bot alerting baseline
This commit is contained in:
@@ -83,6 +83,24 @@ Recommended approach:
|
|||||||
`bot_assistant_rate_limit_rolling`,
|
`bot_assistant_rate_limit_rolling`,
|
||||||
`bot_assistant_rate_limit_rolling_window_ms`
|
`bot_assistant_rate_limit_rolling_window_ms`
|
||||||
- optional `bot_mini_app_allowed_origins`
|
- optional `bot_mini_app_allowed_origins`
|
||||||
|
- optional `alert_notification_emails`
|
||||||
|
|
||||||
|
## Alerting baseline
|
||||||
|
|
||||||
|
Terraform can also provision a minimal monitoring baseline for the bot:
|
||||||
|
|
||||||
|
- email notification channels from `alert_notification_emails`
|
||||||
|
- log-based metrics for:
|
||||||
|
- `telegram.bot_error`
|
||||||
|
- `payment.ingest_failed`
|
||||||
|
- `purchase.ingest_failed`
|
||||||
|
- `assistant.reply_failed`
|
||||||
|
- `scheduler.reminder.dispatch_failed`
|
||||||
|
- an alert policy for Cloud Run 5xx responses on the bot API service
|
||||||
|
- one alert policy per structured bot failure event above
|
||||||
|
|
||||||
|
If you use email channels, Google Cloud will send a one-time confirmation email for each address.
|
||||||
|
The notification channel will not deliver alerts until that confirmation step is completed.
|
||||||
|
|
||||||
## CI validation
|
## CI validation
|
||||||
|
|
||||||
|
|||||||
@@ -40,6 +40,8 @@ locals {
|
|||||||
"cloudscheduler.googleapis.com",
|
"cloudscheduler.googleapis.com",
|
||||||
"iam.googleapis.com",
|
"iam.googleapis.com",
|
||||||
"iamcredentials.googleapis.com",
|
"iamcredentials.googleapis.com",
|
||||||
|
"logging.googleapis.com",
|
||||||
|
"monitoring.googleapis.com",
|
||||||
"run.googleapis.com",
|
"run.googleapis.com",
|
||||||
"secretmanager.googleapis.com",
|
"secretmanager.googleapis.com",
|
||||||
"sts.googleapis.com"
|
"sts.googleapis.com"
|
||||||
|
|||||||
162
infra/terraform/monitoring.tf
Normal file
162
infra/terraform/monitoring.tf
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
# Naming inputs shared by the log-based metrics and their alert policies.
locals {
  # name_prefix with every "-" replaced by "_", used as the metric-name prefix.
  monitoring_metric_prefix = replace(local.name_prefix, "-", "_")

  # One entry per structured failure event. Each entry carries:
  #   event        - value matched against jsonPayload.event in the log filter
  #   metric_name  - metric prefix + "_" + the event with "." replaced by "_"
  #   display_name - name_prefix + " " + a human-readable label
  bot_error_metrics = {
    for key, spec in {
      telegram_bot_error = {
        event = "telegram.bot_error"
        label = "Telegram bot error"
      }
      payment_ingest_failed = {
        event = "payment.ingest_failed"
        label = "payment ingest failed"
      }
      purchase_ingest_failed = {
        event = "purchase.ingest_failed"
        label = "purchase ingest failed"
      }
      assistant_reply_failed = {
        event = "assistant.reply_failed"
        label = "assistant reply failed"
      }
      reminder_dispatch_failed = {
        event = "scheduler.reminder.dispatch_failed"
        label = "reminder dispatch failed"
      }
    } : key => {
      event        = spec.event
      metric_name  = "${local.monitoring_metric_prefix}_${replace(spec.event, ".", "_")}"
      display_name = "${local.name_prefix} ${spec.label}"
    }
  }
}
|
||||||
|
|
||||||
|
# One email notification channel per address in var.alert_notification_emails.
# An empty list yields no channels.
resource "google_monitoring_notification_channel" "email" {
  # toset() de-duplicates the addresses; for a set, each.key == each.value.
  for_each = toset(var.alert_notification_emails)

  type         = "email"
  project      = var.project_id
  display_name = "${local.name_prefix} alerts ${each.key}"

  labels = {
    email_address = each.key
  }

  # Wait for the project services enabled by google_project_service.enabled.
  depends_on = [google_project_service.enabled]
}
|
||||||
|
|
||||||
|
# One log-based counter metric per entry in local.bot_error_metrics.
# Each metric counts Cloud Run log entries of the bot API service whose
# jsonPayload.event equals the entry's event name.
resource "google_logging_metric" "bot_error_events" {
  for_each = local.bot_error_metrics

  name        = each.value.metric_name
  project     = var.project_id
  description = "Counts `${each.value.event}` log events for ${module.bot_api_service.name}."

  # The three filter lines are implicitly ANDed by the Logging query language.
  filter = <<-EOT
    resource.type="cloud_run_revision"
    resource.labels.service_name="${module.bot_api_service.name}"
    jsonPayload.event="${each.value.event}"
  EOT

  metric_descriptor {
    unit        = "1"
    value_type  = "INT64"
    metric_kind = "DELTA"
  }

  # Wait for the project services enabled by google_project_service.enabled.
  depends_on = [google_project_service.enabled]
}
|
||||||
|
|
||||||
|
# Alert policy that fires when the bot API Cloud Run service returns any
# 5xx response within a 300s alignment window.
resource "google_monitoring_alert_policy" "bot_api_5xx" {
  project      = var.project_id
  display_name = "${local.name_prefix} bot API 5xx"
  combiner     = "OR"

  # Notify every email channel created from var.alert_notification_emails.
  notification_channels = values(google_monitoring_notification_channel.email)[*].name

  documentation {
    mime_type = "text/markdown"
    content   = "Cloud Run is returning 5xx responses for `${module.bot_api_service.name}` in `${var.environment}`."
  }

  conditions {
    display_name = "Cloud Run 5xx responses"

    condition_threshold {
      # Request count of the bot API service, restricted to the 5xx class.
      filter = <<-EOT
        resource.type="cloud_run_revision"
        resource.labels.service_name="${module.bot_api_service.name}"
        metric.type="run.googleapis.com/request_count"
        metric.labels.response_code_class="5xx"
      EOT

      # Trip as soon as the summed count in a window is above zero.
      threshold_value = 0
      comparison      = "COMPARISON_GT"
      duration        = "0s"

      # Sum per series over 300s, then sum all series per service name.
      aggregations {
        alignment_period     = "300s"
        per_series_aligner   = "ALIGN_SUM"
        cross_series_reducer = "REDUCE_SUM"
        group_by_fields      = ["resource.labels.service_name"]
      }

      trigger {
        count = 1
      }
    }
  }

  alert_strategy {
    auto_close = "1800s"
  }

  # Wait for the project services enabled by google_project_service.enabled.
  depends_on = [google_project_service.enabled]
}
|
||||||
|
|
||||||
|
# One alert policy per structured bot failure event in local.bot_error_metrics.
# Each policy watches the matching log-based metric from
# google_logging_metric.bot_error_events and fires when the event is logged.
resource "google_monitoring_alert_policy" "bot_error_events" {
  for_each = local.bot_error_metrics

  project      = var.project_id
  display_name = each.value.display_name
  combiner     = "OR"

  # Notify every email channel created from var.alert_notification_emails.
  notification_channels = [
    for channel in google_monitoring_notification_channel.email : channel.name
  ]

  documentation {
    content   = "Structured bot failure event `${each.value.event}` was logged by `${module.bot_api_service.name}` in `${var.environment}`."
    mime_type = "text/markdown"
  }

  conditions {
    display_name = each.value.display_name

    condition_threshold {
      # User-defined log-based metrics are written against the monitored
      # resource of the matching log entries. The metric filter only matches
      # cloud_run_revision logs, so the time series live on that resource type;
      # filtering on resource.type="global" would match no series and the
      # policy would never fire.
      filter = <<-EOT
        resource.type="cloud_run_revision"
        metric.type="logging.googleapis.com/user/${google_logging_metric.bot_error_events[each.key].name}"
      EOT

      # Any non-zero event rate within a 300s window trips the condition.
      comparison      = "COMPARISON_GT"
      threshold_value = 0
      duration        = "0s"

      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_RATE"
      }

      trigger {
        count = 1
      }
    }
  }

  alert_strategy {
    auto_close = "1800s"
  }

  # The metric descriptor must exist before a policy can reference it.
  depends_on = [google_logging_metric.bot_error_events]
}
|
||||||
@@ -23,6 +23,9 @@ bot_assistant_rate_limit_rolling_window_ms = 86400000
|
|||||||
bot_mini_app_allowed_origins = [
|
bot_mini_app_allowed_origins = [
|
||||||
"https://household-dev-mini-app-abc123-ew.a.run.app"
|
"https://household-dev-mini-app-abc123-ew.a.run.app"
|
||||||
]
|
]
|
||||||
|
alert_notification_emails = [
|
||||||
|
"alerts@example.com"
|
||||||
|
]
|
||||||
|
|
||||||
scheduler_utilities_cron = "0 9 * * *"
|
scheduler_utilities_cron = "0 9 * * *"
|
||||||
scheduler_rent_warning_cron = "0 9 * * *"
|
scheduler_rent_warning_cron = "0 9 * * *"
|
||||||
|
|||||||
@@ -138,6 +138,12 @@ variable "bot_mini_app_allowed_origins" {
|
|||||||
default = []
|
default = []
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Recipients for monitoring alerts. One email notification channel is created
# per address; the default empty list creates none.
variable "alert_notification_emails" {
  type        = list(string)
  default     = []
  description = "Email addresses that should receive bot monitoring alerts"
}
|
||||||
|
|
||||||
variable "openai_api_key_secret_id" {
|
variable "openai_api_key_secret_id" {
|
||||||
description = "Optional Secret Manager ID for OPENAI_API_KEY"
|
description = "Optional Secret Manager ID for OPENAI_API_KEY"
|
||||||
type = string
|
type = string
|
||||||
|
|||||||
Reference in New Issue
Block a user