From c6a9ade586cf0911371f4eea7d54480cc25ad257 Mon Sep 17 00:00:00 2001 From: whekin Date: Sun, 8 Mar 2026 22:44:36 +0400 Subject: [PATCH] feat(ops): add first deployment runbook tooling --- docs/runbooks/dev-setup.md | 3 + docs/runbooks/first-deploy.md | 255 ++++++++++++++++++ .../HOUSEBOT-062-first-deploy-runbook.md | 62 +++++ infra/terraform/README.md | 2 + infra/terraform/main.tf | 3 + infra/terraform/terraform.tfvars.example | 3 + infra/terraform/variables.tf | 6 + package.json | 4 +- scripts/ops/deploy-smoke.ts | 124 +++++++++ scripts/ops/telegram-webhook.ts | 86 ++++++ 10 files changed, 547 insertions(+), 1 deletion(-) create mode 100644 docs/runbooks/first-deploy.md create mode 100644 docs/specs/HOUSEBOT-062-first-deploy-runbook.md create mode 100644 scripts/ops/deploy-smoke.ts create mode 100644 scripts/ops/telegram-webhook.ts diff --git a/docs/runbooks/dev-setup.md b/docs/runbooks/dev-setup.md index 5cb78f5..67f20d5 100644 --- a/docs/runbooks/dev-setup.md +++ b/docs/runbooks/dev-setup.md @@ -26,6 +26,8 @@ bun run db:generate bun run db:check bun run db:migrate bun run db:seed +bun run ops:telegram:webhook info +bun run ops:deploy:smoke bun run infra:fmt:check bun run infra:validate ``` @@ -60,6 +62,7 @@ bun run review:coderabbit - Typed environment validation lives in `packages/config/src/env.ts`. - Copy `.env.example` to `.env` before running app/database commands. - Migration workflow is documented in `docs/runbooks/migrations.md`. +- First deploy flow is documented in `docs/runbooks/first-deploy.md`. ## CI/CD diff --git a/docs/runbooks/first-deploy.md b/docs/runbooks/first-deploy.md new file mode 100644 index 0000000..507f973 --- /dev/null +++ b/docs/runbooks/first-deploy.md @@ -0,0 +1,255 @@ +# First Deployment Runbook + +## Purpose + +Execute the first real deployment with a repeatable sequence that covers infrastructure, secrets, webhook cutover, smoke checks, scheduler rollout, and rollback. + +## Preconditions + +- `main` is green in CI. 
+- Terraform baseline has already been reviewed for the target environment. +- You have access to: + - GCP project + - GitHub repo settings + - Telegram bot token + - Supabase project and database URL + +## Required Configuration Inventory + +### Terraform variables + +Required in your environment `*.tfvars`: + +- `project_id` +- `region` +- `environment` +- `bot_api_image` +- `mini_app_image` +- `bot_household_id` +- `bot_household_chat_id` +- `bot_purchase_topic_id` + +Recommended: + +- `bot_mini_app_allowed_origins` +- `scheduler_timezone` +- `scheduler_paused = true` +- `scheduler_dry_run = true` + +### Secret Manager values + +Create the secret resources via Terraform, then add secret versions for: + +- `telegram-bot-token` +- `telegram-webhook-secret` +- `scheduler-shared-secret` +- `database-url` +- optional `openai-api-key` +- optional `supabase-url` +- optional `supabase-publishable-key` + +### GitHub Actions secrets + +Required for CD: + +- `GCP_PROJECT_ID` +- `GCP_WORKLOAD_IDENTITY_PROVIDER` +- `GCP_SERVICE_ACCOUNT` + +Recommended: + +- `DATABASE_URL` + +### GitHub Actions variables + +Set if you do not want the defaults: + +- `GCP_REGION` +- `ARTIFACT_REPOSITORY` +- `CLOUD_RUN_SERVICE_BOT` +- `CLOUD_RUN_SERVICE_MINI` + +## Phase 1: Local Readiness + +Run the quality gates locally from the deployment ref: + +```bash +bun run format:check +bun run lint +bun run typecheck +bun run test +bun run build +``` + +If the release includes schema changes, also run: + +```bash +bun run db:check +E2E_SMOKE_ALLOW_WRITE=true bun run test:e2e +``` + +## Phase 2: Provision or Reconcile Infrastructure + +1. Prepare environment-specific variables: + +```bash +cp infra/terraform/terraform.tfvars.example infra/terraform/dev.tfvars +``` + +2. Initialize Terraform with the correct state bucket: + +```bash +terraform -chdir=infra/terraform init -backend-config="bucket=<terraform-state-bucket>" +``` + +3. 
Review and apply: + +```bash +terraform -chdir=infra/terraform plan -var-file=dev.tfvars +terraform -chdir=infra/terraform apply -var-file=dev.tfvars +``` + +4. Capture outputs: + +```bash +BOT_API_URL="$(terraform -chdir=infra/terraform output -raw bot_api_service_url)" +MINI_APP_URL="$(terraform -chdir=infra/terraform output -raw mini_app_service_url)" +``` + +5. If you did not know the mini app URL before the first apply, set `bot_mini_app_allowed_origins = ["<value of MINI_APP_URL>"]` in `dev.tfvars` (tfvars files do not expand shell variables, so paste the literal URL) and apply again. + +## Phase 3: Add Runtime Secret Versions + +Use the real project ID from Terraform variables: + +```bash +echo -n "<telegram-bot-token>" | gcloud secrets versions add telegram-bot-token --data-file=- --project <project-id> +echo -n "<telegram-webhook-secret>" | gcloud secrets versions add telegram-webhook-secret --data-file=- --project <project-id> +echo -n "<scheduler-shared-secret>" | gcloud secrets versions add scheduler-shared-secret --data-file=- --project <project-id> +echo -n "<database-url>" | gcloud secrets versions add database-url --data-file=- --project <project-id> +``` + +Add optional secret versions only if those integrations are enabled. + +## Phase 4: Configure GitHub CD + +Populate GitHub repository secrets with the Terraform outputs: + +- `GCP_PROJECT_ID` +- `GCP_WORKLOAD_IDENTITY_PROVIDER` +- `GCP_SERVICE_ACCOUNT` +- optional `DATABASE_URL` + +If you prefer the GitHub CLI: + +```bash +gh secret set GCP_PROJECT_ID +gh secret set GCP_WORKLOAD_IDENTITY_PROVIDER +gh secret set GCP_SERVICE_ACCOUNT +gh secret set DATABASE_URL # optional: lets CD run migrations +``` + +Set GitHub repository variables if you want to override the defaults used by `.github/workflows/cd.yml`. + +## Phase 5: Trigger the First Deployment + +You have two safe options: + +- Merge the deployment ref into `main` and let `CD` run after successful CI. +- Trigger `CD` manually from the GitHub Actions UI with `workflow_dispatch`. 
+ +The workflow will: + +- optionally run `bun run db:migrate` if `DATABASE_URL` secret is configured +- build and push bot and mini app images +- deploy both Cloud Run services + +## Phase 6: Telegram Webhook Cutover + +After the bot service is live, set the webhook explicitly: + +```bash +export TELEGRAM_BOT_TOKEN="$(gcloud secrets versions access latest --secret telegram-bot-token --project <project-id>)" +export TELEGRAM_WEBHOOK_SECRET="$(gcloud secrets versions access latest --secret telegram-webhook-secret --project <project-id>)" +export TELEGRAM_WEBHOOK_URL="${BOT_API_URL}/webhook/telegram" + +bun run ops:telegram:webhook set +bun run ops:telegram:webhook info +``` + +If you want to discard queued updates during cutover: + +```bash +export TELEGRAM_DROP_PENDING_UPDATES=true +bun run ops:telegram:webhook set +``` + +## Phase 7: Post-Deploy Smoke Checks + +Run the smoke script: + +```bash +export BOT_API_URL +export MINI_APP_URL +export TELEGRAM_EXPECTED_WEBHOOK_URL="${BOT_API_URL}/webhook/telegram" + +bun run ops:deploy:smoke +``` + +The smoke script verifies: + +- bot health endpoint +- mini app root delivery +- mini app auth endpoint is mounted +- scheduler endpoint rejects unauthenticated requests +- Telegram webhook matches the expected URL when both `TELEGRAM_BOT_TOKEN` and `TELEGRAM_EXPECTED_WEBHOOK_URL` are set + +## Phase 8: Scheduler Enablement + +First release: + +1. Keep `scheduler_paused = true` and `scheduler_dry_run = true` on initial deploy. +2. After smoke checks pass, set `scheduler_paused = false` and apply Terraform. +3. Trigger one job manually: + +```bash +gcloud scheduler jobs run household-dev-utilities --location <region> --project <project-id> +``` + +4. Verify the reminder request succeeded and produced `dryRun: true` logs. +5. Set `scheduler_dry_run = false` and apply Terraform. +6. Trigger one job again and verify the delivery side behaves as expected. + +## Rollback + +If the release is unhealthy: + +1. 
Pause scheduler jobs again in Terraform: + +```bash +terraform -chdir=infra/terraform apply -var-file=dev.tfvars -var='scheduler_paused=true' +``` + +2. Move Cloud Run traffic back to the last healthy revision: + +```bash +gcloud run revisions list --service <bot-service> --region <region> --project <project-id> +gcloud run services update-traffic <bot-service> --region <region> --project <project-id> --to-revisions <bot-revision>=100 +gcloud run revisions list --service <mini-app-service> --region <region> --project <project-id> +gcloud run services update-traffic <mini-app-service> --region <region> --project <project-id> --to-revisions <mini-app-revision>=100 +``` + +3. If webhook traffic must stop immediately: + +```bash +bun run ops:telegram:webhook delete +``` + +4. If migrations were additive, leave schema in place and roll application code back. +5. If a destructive migration failed, stop and use the rollback SQL prepared in that PR. + +## Dev-to-Prod Promotion Notes + +- Repeat the same sequence in a separate `prod.tfvars` and Terraform state. +- Keep separate GCP projects for `dev` and `prod` when possible. +- Do not unpause production scheduler jobs until prod smoke checks are complete. diff --git a/docs/specs/HOUSEBOT-062-first-deploy-runbook.md b/docs/specs/HOUSEBOT-062-first-deploy-runbook.md new file mode 100644 index 0000000..2576c88 --- /dev/null +++ b/docs/specs/HOUSEBOT-062-first-deploy-runbook.md @@ -0,0 +1,62 @@ +# HOUSEBOT-062: First Deployment Runbook and Cutover Checklist + +## Summary + +Document the exact first-deploy sequence so one engineer can provision, deploy, cut over Telegram webhook traffic, validate the runtime, and roll back safely without tribal knowledge. + +## Goals + +- Provide one runbook that covers infrastructure, CD, webhook cutover, smoke checks, and scheduler enablement. +- Close configuration gaps that would otherwise require ad hoc manual fixes. +- Add lightweight operator scripts for webhook management and post-deploy validation. + +## Non-goals + +- Full production monitoring stack. +- Automated blue/green or canary deployment. +- Elimination of all manual steps from first deploy. 
+ +## Scope + +- In: first-deploy runbook, config inventory, smoke scripts, Terraform runtime config needed for deploy safety. +- Out: continuous release automation redesign, incident response handbook. + +## Interfaces and Contracts + +- Operator scripts: + - `bun run ops:telegram:webhook info|set|delete` + - `bun run ops:deploy:smoke` +- Runbook: + - `docs/runbooks/first-deploy.md` +- Terraform runtime config: + - optional `bot_mini_app_allowed_origins` + +## Security and Privacy + +- Webhook setup uses Telegram secret token support. +- Post-deploy validation does not require scheduler auth bypass. +- Mini app origin allow-list is configurable through Terraform instead of ad hoc runtime mutation. + +## Observability + +- Smoke checks verify bot health, mounted app routes, and Telegram webhook state. +- Runbook includes explicit verification before scheduler jobs are unpaused. + +## Edge Cases and Failure Modes + +- First Terraform apply may not know the final mini app URL; runbook includes a second apply to set allowed origins. +- Missing `DATABASE_URL` in GitHub secrets skips migration automation. +- Scheduler jobs remain paused and dry-run by default to prevent accidental sends. + +## Test Plan + +- Unit: script typecheck through workspace `typecheck`. +- Integration: `bun run format:check`, `bun run lint`, `bun run typecheck`, `bun run test`, `bun run build`, `bun run infra:validate`. +- Manual: execute the runbook in dev before prod cutover. + +## Acceptance Criteria + +- [ ] A single runbook describes the full first deploy flow. +- [ ] Required secrets, vars, and Terraform values are enumerated. +- [ ] Webhook cutover and smoke checks are script-assisted. +- [ ] Rollback steps are explicit and environment-safe. 
diff --git a/infra/terraform/README.md b/infra/terraform/README.md index 2756978..0fa8bf3 100644 --- a/infra/terraform/README.md +++ b/infra/terraform/README.md @@ -73,6 +73,7 @@ Recommended approach: - `bot_household_chat_id` - `bot_purchase_topic_id` - optional `bot_parser_model` + - optional `bot_mini_app_allowed_origins` ## CI validation @@ -86,3 +87,4 @@ CI runs: - Scheduler jobs default to `paused = true` and `dry_run = true` to prevent accidental sends before live reminder delivery is ready. - Bot API is public to accept Telegram webhooks; scheduler endpoint should still verify app-level auth. +- `bot_mini_app_allowed_origins` cannot be auto-derived in Terraform because the bot and mini app Cloud Run services reference each other; set it explicitly once the mini app URL is known. diff --git a/infra/terraform/main.tf b/infra/terraform/main.tf index 5383f1c..61b932c 100644 --- a/infra/terraform/main.tf +++ b/infra/terraform/main.tf @@ -93,6 +93,9 @@ module "bot_api_service" { var.bot_parser_model == null ? {} : { PARSER_MODEL = var.bot_parser_model }, + length(var.bot_mini_app_allowed_origins) == 0 ? 
{} : { + MINI_APP_ALLOWED_ORIGINS = join(",", var.bot_mini_app_allowed_origins) + }, { SCHEDULER_OIDC_ALLOWED_EMAILS = google_service_account.scheduler_invoker.email } diff --git a/infra/terraform/terraform.tfvars.example b/infra/terraform/terraform.tfvars.example index e8c3efd..0ac34c4 100644 --- a/infra/terraform/terraform.tfvars.example +++ b/infra/terraform/terraform.tfvars.example @@ -12,6 +12,9 @@ bot_household_id = "11111111-1111-4111-8111-111111111111" bot_household_chat_id = "-1001234567890" bot_purchase_topic_id = 777 bot_parser_model = "gpt-4.1-mini" +bot_mini_app_allowed_origins = [ + "https://household-dev-mini-app-abc123-ew.a.run.app" +] scheduler_utilities_cron = "0 9 4 * *" scheduler_rent_warning_cron = "0 9 17 * *" diff --git a/infra/terraform/variables.tf b/infra/terraform/variables.tf index 2d7b508..c54b679 100644 --- a/infra/terraform/variables.tf +++ b/infra/terraform/variables.tf @@ -111,6 +111,12 @@ variable "bot_parser_model" { nullable = true } +variable "bot_mini_app_allowed_origins" { + description = "Optional allow-list of mini app origins for bot CORS handling" + type = list(string) + default = [] +} + variable "openai_api_key_secret_id" { description = "Optional Secret Manager ID for OPENAI_API_KEY" type = string diff --git a/package.json b/package.json index 8b2025e..9d52fc5 100644 --- a/package.json +++ b/package.json @@ -32,7 +32,9 @@ "docker:build:miniapp": "docker build -f apps/miniapp/Dockerfile -t household-miniapp:local .", "docker:build": "bun run docker:build:bot && bun run docker:build:miniapp", "docker:smoke": "docker compose up --build", - "test:e2e": "bun run scripts/e2e/billing-flow.ts" + "test:e2e": "bun run scripts/e2e/billing-flow.ts", + "ops:deploy:smoke": "bun run scripts/ops/deploy-smoke.ts", + "ops:telegram:webhook": "bun run scripts/ops/telegram-webhook.ts" }, "devDependencies": { "@types/bun": "1.3.10", diff --git a/scripts/ops/deploy-smoke.ts b/scripts/ops/deploy-smoke.ts new file mode 100644 index 
0000000..07edaff
--- /dev/null
+++ b/scripts/ops/deploy-smoke.ts
@@ -0,0 +1,179 @@
+/**
+ * Reads a required environment variable.
+ *
+ * @param name - Environment variable name.
+ * @returns The trimmed, non-empty value.
+ * @throws Error when the variable is unset or blank.
+ */
+function requireEnv(name: string): string {
+  const value = process.env[name]?.trim()
+  if (!value) {
+    throw new Error(`${name} is required`)
+  }
+
+  return value
+}
+
+/**
+ * Resolves `path` against `base` while keeping any path prefix on `base`.
+ *
+ * @param base - Service base URL, with or without a trailing slash.
+ * @param path - Route to append; a leading slash is optional.
+ * @returns The combined URL.
+ */
+function toUrl(base: string, path: string): URL {
+  // Resolving an absolute path would drop a prefix on `base`, so treat `base`
+  // as a directory and `path` as relative to it.
+  const normalizedBase = base.endsWith('/') ? base : `${base}/`
+  return new URL(path.replace(/^\//, ''), normalizedBase)
+}
+
+/** Narrows an unknown JSON value to a plain object so its fields can be read. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value)
+}
+
+/**
+ * Fetches `url` and returns its JSON body after asserting the HTTP status.
+ *
+ * @param url - Endpoint to call.
+ * @param init - Fetch options (method, headers, body).
+ * @param expectedStatus - Exact HTTP status the endpoint must answer with.
+ * @returns The parsed JSON body, or `null` when the body is empty.
+ * @throws Error when the status differs or the body is not valid JSON.
+ */
+async function expectJson(url: URL, init: RequestInit, expectedStatus: number): Promise<unknown> {
+  const response = await fetch(url, init)
+  const text = await response.text()
+
+  // Check the status before parsing: an unexpected status often carries a
+  // non-JSON body, and the status mismatch is the more useful failure.
+  if (response.status !== expectedStatus) {
+    throw new Error(
+      `${url.toString()} expected ${expectedStatus}, received ${response.status}: ${text}`
+    )
+  }
+
+  if (text.length === 0) {
+    return null
+  }
+
+  try {
+    return JSON.parse(text) as unknown
+  } catch {
+    throw new Error(`${url.toString()} returned a non-JSON body: ${text}`)
+  }
+}
+
+/**
+ * Calls Telegram `getWebhookInfo` for the given bot.
+ *
+ * @param botToken - Telegram bot token.
+ * @returns The `result` object from the Telegram response.
+ * @throws Error when the call fails or the response has no `result` object.
+ */
+async function fetchWebhookInfo(botToken: string): Promise<Record<string, unknown>> {
+  const response = await fetch(`https://api.telegram.org/bot${botToken}/getWebhookInfo`)
+  const payload: unknown = await response.json()
+
+  if (!response.ok || !isRecord(payload) || payload.ok !== true || !isRecord(payload.result)) {
+    throw new Error(`Telegram getWebhookInfo failed: ${JSON.stringify(payload)}`)
+  }
+
+  return payload.result
+}
+
+/**
+ * Runs the post-deploy smoke checks and prints a JSON summary on success.
+ *
+ * Requires `BOT_API_URL` and `MINI_APP_URL`. The Telegram webhook check runs
+ * only when `TELEGRAM_BOT_TOKEN` and `TELEGRAM_EXPECTED_WEBHOOK_URL` are both set.
+ *
+ * @throws Error describing the first check that failed.
+ */
+async function run(): Promise<void> {
+  const botApiUrl = requireEnv('BOT_API_URL')
+  const miniAppUrl = requireEnv('MINI_APP_URL')
+
+  const health = await expectJson(toUrl(botApiUrl, '/healthz'), {}, 200)
+  if (!isRecord(health) || health.ok !== true) {
+    throw new Error('Bot health check returned unexpected payload')
+  }
+
+  // An empty session request is expected to be rejected with 400; any other
+  // status (such as 404) means the mini app auth route is not mounted.
+  await expectJson(
+    toUrl(botApiUrl, '/api/miniapp/session'),
+    {
+      method: 'POST',
+      headers: {
+        'content-type': 'application/json'
+      },
+      body: JSON.stringify({})
+    },
+    400
+  )
+
+  // The scheduler endpoint must reject a request that carries no credentials.
+  await expectJson(
+    toUrl(botApiUrl, '/jobs/reminder/utilities'),
+    {
+      method: 'POST',
+      headers: {
+        'content-type': 'application/json'
+      },
+      body: JSON.stringify({})
+    },
+    401
+  )
+
+  const miniAppResponse = await fetch(miniAppUrl)
+  const miniAppHtml = await miniAppResponse.text()
+  if (!miniAppResponse.ok) {
+    throw new Error(`Mini app root returned ${miniAppResponse.status}`)
+  }
+  if (!miniAppHtml.includes('/config.js')) {
+    throw new Error('Mini app root does not reference runtime config')
+  }
+
+  const telegramBotToken = process.env.TELEGRAM_BOT_TOKEN?.trim()
+  const expectedWebhookUrl = process.env.TELEGRAM_EXPECTED_WEBHOOK_URL?.trim()
+  let checkedWebhook = false
+
+  // Blank values count as "not provided", so gate on truthiness and report
+  // `checkedWebhook` from whether the check actually ran.
+  if (telegramBotToken && expectedWebhookUrl) {
+    const webhookInfo = await fetchWebhookInfo(telegramBotToken)
+
+    if (webhookInfo.url !== expectedWebhookUrl) {
+      throw new Error(
+        `Telegram webhook mismatch: expected ${expectedWebhookUrl}, received ${String(webhookInfo.url)}`
+      )
+    }
+
+    const lastError = webhookInfo.last_error_message
+    if (typeof lastError === 'string' && lastError.length > 0) {
+      throw new Error(`Telegram webhook reports last_error_message=${lastError}`)
+    }
+
+    checkedWebhook = true
+  }
+
+  console.log(
+    JSON.stringify(
+      {
+        ok: true,
+        botApiUrl,
+        miniAppUrl,
+        checkedWebhook
+      },
+      null,
+      2
+    )
+  )
+}
+
+run().catch((error: unknown) => {
+  console.error(error instanceof Error ? error.message : String(error))
+  process.exitCode = 1
+})
diff --git a/scripts/ops/telegram-webhook.ts b/scripts/ops/telegram-webhook.ts
new file mode 100644
index 0000000..26b40ab
--- /dev/null
+++ b/scripts/ops/telegram-webhook.ts
@@ -0,0 +1,135 @@
+const WEBHOOK_COMMANDS = ['info', 'set', 'delete'] as const
+
+type WebhookCommand = (typeof WEBHOOK_COMMANDS)[number]
+
+/** Checks that a CLI argument is one of the supported webhook commands. */
+function isWebhookCommand(value: string): value is WebhookCommand {
+  return (WEBHOOK_COMMANDS as readonly string[]).includes(value)
+}
+
+/** Narrows an unknown JSON value to a plain object so its fields can be read. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value)
+}
+
+/**
+ * Reads a required environment variable.
+ *
+ * @param name - Environment variable name.
+ * @returns The trimmed, non-empty value.
+ * @throws Error when the variable is unset or blank.
+ */
+function requireEnv(name: string): string {
+  const value = process.env[name]?.trim()
+  if (!value) {
+    throw new Error(`${name} is required`)
+  }
+
+  return value
+}
+
+/** Copies `TELEGRAM_DROP_PENDING_UPDATES` into the request params when it is set. */
+function applyDropPendingUpdates(params: URLSearchParams): void {
+  const dropPendingUpdates = process.env.TELEGRAM_DROP_PENDING_UPDATES?.trim()
+  if (dropPendingUpdates) {
+    params.set('drop_pending_updates', dropPendingUpdates)
+  }
+}
+
+/**
+ * Calls a Telegram Bot API method and returns its `result` field.
+ *
+ * @param botToken - Telegram bot token.
+ * @param method - Bot API method name, for example `setWebhook`.
+ * @param body - Form parameters; when given the request is sent as POST.
+ * @returns The `result` value from the Telegram response.
+ * @throws Error when the response is not JSON or Telegram reports a failure.
+ */
+async function telegramRequest(
+  botToken: string,
+  method: string,
+  body?: URLSearchParams
+): Promise<unknown> {
+  const response = await fetch(`https://api.telegram.org/bot${botToken}/${method}`, {
+    method: body ? 'POST' : 'GET',
+    body
+  })
+
+  // Read the body as text first so a non-JSON reply (such as a proxy error
+  // page) is reported with its status instead of a bare parse error.
+  const text = await response.text()
+  let payload: unknown
+  try {
+    payload = JSON.parse(text)
+  } catch {
+    throw new Error(`Telegram ${method} returned non-JSON (HTTP ${response.status}): ${text}`)
+  }
+
+  if (!response.ok || !isRecord(payload) || payload.ok !== true) {
+    throw new Error(`Telegram ${method} failed: ${JSON.stringify(payload)}`)
+  }
+
+  return payload.result
+}
+
+/**
+ * Runs the webhook command given as the first CLI argument (default `info`).
+ *
+ * Reads `TELEGRAM_BOT_TOKEN` for every command and `TELEGRAM_WEBHOOK_URL` for
+ * `set`. `TELEGRAM_WEBHOOK_SECRET`, `TELEGRAM_MAX_CONNECTIONS` and
+ * `TELEGRAM_DROP_PENDING_UPDATES` are optional.
+ *
+ * @throws Error on an unsupported command, missing env, or a Telegram failure.
+ */
+async function run(): Promise<void> {
+  const command = process.argv[2] ?? 'info'
+  // Validate the command before reading env so a typo is reported as an
+  // unsupported command rather than as a missing bot token.
+  if (!isWebhookCommand(command)) {
+    throw new Error(`Unsupported command: ${command}`)
+  }
+
+  const botToken = requireEnv('TELEGRAM_BOT_TOKEN')
+
+  switch (command) {
+    case 'info': {
+      const result = await telegramRequest(botToken, 'getWebhookInfo')
+      console.log(JSON.stringify(result, null, 2))
+      return
+    }
+    case 'set': {
+      const params = new URLSearchParams({
+        url: requireEnv('TELEGRAM_WEBHOOK_URL')
+      })
+
+      const secretToken = process.env.TELEGRAM_WEBHOOK_SECRET?.trim()
+      if (secretToken) {
+        params.set('secret_token', secretToken)
+      }
+
+      const maxConnections = process.env.TELEGRAM_MAX_CONNECTIONS?.trim()
+      if (maxConnections) {
+        params.set('max_connections', maxConnections)
+      }
+
+      applyDropPendingUpdates(params)
+
+      const result = await telegramRequest(botToken, 'setWebhook', params)
+      console.log(JSON.stringify({ ok: true, result }, null, 2))
+      return
+    }
+    case 'delete': {
+      const params = new URLSearchParams()
+      applyDropPendingUpdates(params)
+
+      const result = await telegramRequest(botToken, 'deleteWebhook', params)
+      console.log(JSON.stringify({ ok: true, result }, null, 2))
+      return
+    }
+  }
+}
+
+run().catch((error: unknown) => {
+  console.error(error instanceof Error ? error.message : String(error))
+  process.exitCode = 1
+})