From 3d157549364e7e599f0d1ec822b352abc9a3736a Mon Sep 17 00:00:00 2001 From: whekin Date: Mon, 16 Mar 2026 05:17:47 +0400 Subject: [PATCH] fix(ci): push latest tag and reduce monitoring costs - Add latest tag push alongside SHA tag for manual rollback/debugging - Reduce log retention from 3 to 1 day - Comment out bot_error_metrics and alerts to save ~$0.47/month - Minor whitespace fix in cd.yml --- .github/workflows/cd.yml | 2 +- .github/workflows/ci.yml | 5 +- infra/terraform/monitoring.tf | 140 +++++++++++++++++----------------- 3 files changed, 77 insertions(+), 70 deletions(-) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 6f9df7d..f31bf79 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -187,7 +187,7 @@ jobs: git fetch origin "${{ github.event.workflow_run.head_branch }}" latest_sha=$(git rev-parse "origin/${{ github.event.workflow_run.head_branch }}") deploy_sha="${{ steps.images.outputs.deploy_sha }}" - + if [[ "$latest_sha" != "$deploy_sha" ]]; then echo "::notice::Newer commit ($latest_sha) found on branch. Skipping deployment of $deploy_sha to avoid race conditions." echo "skip=true" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e1b3b1b..9cc574a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -160,6 +160,7 @@ jobs: repo="${GCP_REGION}-docker.pkg.dev/${{ vars.GCP_PROJECT_ID }}/${ARTIFACT_REPOSITORY}" echo "name=${repo}/${{ matrix.service }}:${GITHUB_SHA}" >> "$GITHUB_OUTPUT" echo "cache_ref=${repo}/${{ matrix.service }}:cache" >> "$GITHUB_OUTPUT" + echo "latest=${repo}/${{ matrix.service }}:latest" >> "$GITHUB_OUTPUT" - name: Build and push uses: docker/build-push-action@v6 @@ -167,7 +168,9 @@ jobs: context: . file: apps/${{ matrix.service }}/Dockerfile push: true - tags: ${{ steps.image.outputs.name }} + tags: | + ${{ steps.image.outputs.name }} + ${{ steps.image.outputs.latest }} platforms: linux/amd64 provenance: false cache-from: type=registry,ref=${{ steps.image.outputs.cache_ref }} diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf index 90e1b17..ff84ac6 100644 --- a/infra/terraform/monitoring.tf +++ b/infra/terraform/monitoring.tf @@ -51,7 +51,7 @@ resource "google_logging_project_bucket_config" "default" { project = var.project_id location = "global" bucket_id = "_Default" - retention_days = 3 + retention_days = 1 } resource "google_monitoring_notification_channel" "email" { @@ -68,26 +68,28 @@ resource "google_monitoring_notification_channel" "email" { depends_on = [google_project_service.enabled] } -resource "google_logging_metric" "bot_error_events" { - for_each = local.bot_error_metrics - - project = var.project_id - name = each.value.metric_name - description = "Counts `${each.value.event}` log events for ${module.bot_api_service.name}." - filter = <<-EOT -resource.type="cloud_run_revision" -resource.labels.service_name="${module.bot_api_service.name}" -jsonPayload.event="${each.value.event}" - EOT - - metric_descriptor { - metric_kind = "DELTA" - value_type = "INT64" - unit = "1" - } - - depends_on = [google_project_service.enabled] -} +# DEV-187: Commented out to save ~$0.47/month on Cloud Monitoring costs +# TODO: Re-enable if alerting on specific bot error events becomes necessary +# resource "google_logging_metric" "bot_error_events" { +# for_each = local.bot_error_metrics +# +# project = var.project_id +# name = each.value.metric_name +# description = "Counts `${each.value.event}` log events for ${module.bot_api_service.name}." +# filter = <<-EOT +# resource.type="cloud_run_revision" +# resource.labels.service_name="${module.bot_api_service.name}" +# jsonPayload.event="${each.value.event}" +# EOT +# +# metric_descriptor { +# metric_kind = "DELTA" +# value_type = "INT64" +# unit = "1" +# } +# +# depends_on = [google_project_service.enabled] +# } resource "google_monitoring_alert_policy" "bot_api_5xx" { project = var.project_id @@ -138,50 +140,52 @@ metric.labels.response_code_class="5xx" depends_on = [google_project_service.enabled] } -resource "google_monitoring_alert_policy" "bot_error_events" { - for_each = local.bot_error_metrics - - project = var.project_id - display_name = each.value.display_name - combiner = "OR" - - notification_channels = [ - for channel in google_monitoring_notification_channel.email : channel.name - ] - - documentation { - content = "Structured bot failure event `${each.value.event}` was logged by `${module.bot_api_service.name}` in `${var.environment}`." - mime_type = "text/markdown" - } - - conditions { - display_name = each.value.display_name - - condition_threshold { - filter = <<-EOT -resource.type="cloud_run_revision" -resource.labels.service_name="${module.bot_api_service.name}" -metric.type="logging.googleapis.com/user/${google_logging_metric.bot_error_events[each.key].name}" - EOT - - comparison = "COMPARISON_GT" - threshold_value = 0 - duration = "0s" - - aggregations { - alignment_period = "300s" - per_series_aligner = "ALIGN_RATE" - } - - trigger { - count = 1 - } - } - } - - alert_strategy { - auto_close = "1800s" - } - - depends_on = [google_logging_metric.bot_error_events] -} +# DEV-187: Commented out to save ~$0.47/month on Cloud Monitoring costs +# TODO: Re-enable if alerting on specific bot error events becomes necessary +# resource "google_monitoring_alert_policy" "bot_error_events" { +# for_each = local.bot_error_metrics +# +# project = var.project_id +# display_name = each.value.display_name +# combiner = "OR" +# +# notification_channels = [ +# for channel in google_monitoring_notification_channel.email : channel.name +# ] +# +# documentation { +# content = "Structured bot failure event `${each.value.event}` was logged by `${module.bot_api_service.name}` in `${var.environment}`." +# mime_type = "text/markdown" +# } +# +# conditions { +# display_name = each.value.display_name +# +# condition_threshold { +# filter = <<-EOT +# resource.type="cloud_run_revision" +# resource.labels.service_name="${module.bot_api_service.name}" +# metric.type="logging.googleapis.com/user/${google_logging_metric.bot_error_events[each.key].name}" +# EOT +# +# comparison = "COMPARISON_GT" +# threshold_value = 0 +# duration = "0s" +# +# aggregations { +# alignment_period = "300s" +# per_series_aligner = "ALIGN_RATE" +# } +# +# trigger { +# count = 1 +# } +# } +# } +# +# alert_strategy { +# auto_close = "1800s" +# } +# +# depends_on = [google_logging_metric.bot_error_events] +# }