feat(WHE-23): add hybrid purchase parser with persisted parse metadata

This commit is contained in:
2026-03-05 04:43:57 +04:00
parent 3b1b6468db
commit ebb6ce4ce6
14 changed files with 1881 additions and 7 deletions

View File

@@ -10,6 +10,7 @@
"lint": "oxlint \"src\"" "lint": "oxlint \"src\""
}, },
"dependencies": { "dependencies": {
"@household/application": "workspace:*",
"@household/db": "workspace:*", "@household/db": "workspace:*",
"drizzle-orm": "^0.44.7", "drizzle-orm": "^0.44.7",
"grammy": "1.41.1" "grammy": "1.41.1"

View File

@@ -8,6 +8,8 @@ export interface BotRuntimeConfig {
telegramHouseholdChatId?: string telegramHouseholdChatId?: string
telegramPurchaseTopicId?: number telegramPurchaseTopicId?: number
purchaseTopicIngestionEnabled: boolean purchaseTopicIngestionEnabled: boolean
openaiApiKey?: string
parserModel: string
} }
function parsePort(raw: string | undefined): number { function parsePort(raw: string | undefined): number {
@@ -66,7 +68,8 @@ export function getBotRuntimeConfig(env: NodeJS.ProcessEnv = process.env): BotRu
telegramBotToken: requireValue(env.TELEGRAM_BOT_TOKEN, 'TELEGRAM_BOT_TOKEN'), telegramBotToken: requireValue(env.TELEGRAM_BOT_TOKEN, 'TELEGRAM_BOT_TOKEN'),
telegramWebhookSecret: requireValue(env.TELEGRAM_WEBHOOK_SECRET, 'TELEGRAM_WEBHOOK_SECRET'), telegramWebhookSecret: requireValue(env.TELEGRAM_WEBHOOK_SECRET, 'TELEGRAM_WEBHOOK_SECRET'),
telegramWebhookPath: env.TELEGRAM_WEBHOOK_PATH ?? '/webhook/telegram', telegramWebhookPath: env.TELEGRAM_WEBHOOK_PATH ?? '/webhook/telegram',
purchaseTopicIngestionEnabled purchaseTopicIngestionEnabled,
parserModel: env.PARSER_MODEL?.trim() || 'gpt-4.1-mini'
} }
if (databaseUrl !== undefined) { if (databaseUrl !== undefined) {
@@ -81,6 +84,10 @@ export function getBotRuntimeConfig(env: NodeJS.ProcessEnv = process.env): BotRu
if (telegramPurchaseTopicId !== undefined) { if (telegramPurchaseTopicId !== undefined) {
runtime.telegramPurchaseTopicId = telegramPurchaseTopicId runtime.telegramPurchaseTopicId = telegramPurchaseTopicId
} }
const openaiApiKey = parseOptionalValue(env.OPENAI_API_KEY)
if (openaiApiKey !== undefined) {
runtime.openaiApiKey = openaiApiKey
}
return runtime return runtime
} }

View File

@@ -2,6 +2,7 @@ import { webhookCallback } from 'grammy'
import { createTelegramBot } from './bot' import { createTelegramBot } from './bot'
import { getBotRuntimeConfig } from './config' import { getBotRuntimeConfig } from './config'
import { createOpenAiParserFallback } from './openai-parser-fallback'
import { import {
createPurchaseMessageRepository, createPurchaseMessageRepository,
registerPurchaseTopicIngestion registerPurchaseTopicIngestion
@@ -17,6 +18,7 @@ let closePurchaseRepository: (() => Promise<void>) | undefined
if (runtime.purchaseTopicIngestionEnabled) { if (runtime.purchaseTopicIngestionEnabled) {
const purchaseRepositoryClient = createPurchaseMessageRepository(runtime.databaseUrl!) const purchaseRepositoryClient = createPurchaseMessageRepository(runtime.databaseUrl!)
closePurchaseRepository = purchaseRepositoryClient.close closePurchaseRepository = purchaseRepositoryClient.close
const llmFallback = createOpenAiParserFallback(runtime.openaiApiKey, runtime.parserModel)
registerPurchaseTopicIngestion( registerPurchaseTopicIngestion(
bot, bot,
@@ -25,7 +27,12 @@ if (runtime.purchaseTopicIngestionEnabled) {
householdChatId: runtime.telegramHouseholdChatId!, householdChatId: runtime.telegramHouseholdChatId!,
purchaseTopicId: runtime.telegramPurchaseTopicId! purchaseTopicId: runtime.telegramPurchaseTopicId!
}, },
purchaseRepositoryClient.repository purchaseRepositoryClient.repository,
llmFallback
? {
llmFallback
}
: {}
) )
} else { } else {
console.warn( console.warn(

View File

@@ -0,0 +1,119 @@
import type { PurchaseParserLlmFallback } from '@household/application'
/**
 * JSON object the model is asked to return; its fields mirror the
 * `purchase_parse` schema sent in the request body.
 */
interface OpenAiStructuredResult {
/** Amount as a string; accepted downstream only when it is all ASCII digits and greater than zero. */
amountMinor: string
/** Currency code, limited to the two values listed in the request schema. */
currency: 'GEL' | 'USD'
/** Free-text description; a blank value causes the whole result to be discarded. */
itemDescription: string
/** Confidence score; rounded and clamped to 0..100 before being returned to the caller. */
confidence: number
/** Review flag, copied into the parser result unchanged. */
needsReview: boolean
}
/**
 * Converts a digits-only string into a strictly positive bigint.
 *
 * @param value - Candidate amount as returned by the model.
 * @returns The parsed amount, or `null` when the string contains anything
 *   other than ASCII digits or evaluates to zero.
 */
function asBigInt(value: string): bigint | null {
  if (!/^[0-9]+$/.test(value)) {
    return null
  }
  const parsed = BigInt(value)
  return parsed > 0n ? parsed : null
}

/**
 * Pulls the generated text out of a Responses API payload.
 *
 * The raw HTTP response carries text inside `output[].content[]` parts of type
 * `output_text`; a top-level `output_text` string is only added by SDK helpers.
 * Both shapes are accepted so the function works with plain `fetch`.
 *
 * @param payload - Decoded JSON body of the HTTP response.
 * @returns The concatenated output text, or `null` when none is present.
 */
function extractOutputText(payload: unknown): string | null {
  if (typeof payload !== 'object' || payload === null) {
    return null
  }

  const { output_text: outputText, output } = payload as {
    output_text?: unknown
    output?: unknown
  }
  if (typeof outputText === 'string' && outputText.length > 0) {
    return outputText
  }
  if (!Array.isArray(output)) {
    return null
  }

  const parts: string[] = []
  for (const item of output) {
    const content: unknown = (item as { content?: unknown } | null)?.content
    if (!Array.isArray(content)) {
      continue
    }
    for (const part of content) {
      const candidate = part as { type?: unknown; text?: unknown } | null
      if (candidate?.type === 'output_text' && typeof candidate.text === 'string') {
        parts.push(candidate.text)
      }
    }
  }

  const joined = parts.join('')
  return joined.length > 0 ? joined : null
}

/**
 * Runtime check that a decoded JSON value has the shape requested from the model.
 * Model output is untrusted, so every field is verified instead of being cast.
 */
function isStructuredResult(value: unknown): value is OpenAiStructuredResult {
  if (typeof value !== 'object' || value === null) {
    return false
  }
  const candidate = value as Record<string, unknown>
  return (
    typeof candidate.amountMinor === 'string' &&
    (candidate.currency === 'GEL' || candidate.currency === 'USD') &&
    typeof candidate.itemDescription === 'string' &&
    typeof candidate.confidence === 'number' &&
    Number.isFinite(candidate.confidence) &&
    typeof candidate.needsReview === 'boolean'
  )
}

/**
 * Builds an LLM-backed purchase parser that calls the OpenAI Responses API.
 *
 * @param apiKey - OpenAI API key; when missing or empty no fallback is created.
 * @param model - Model identifier forwarded in the request body.
 * @returns `undefined` when no API key is available; otherwise an async function
 *   that resolves to a parsed purchase, or to `null` when the HTTP status is not
 *   OK or the model output is missing, malformed, or fails validation.
 *   Network failures and an undecodable response body reject the returned promise.
 */
export function createOpenAiParserFallback(
  apiKey: string | undefined,
  model: string
): PurchaseParserLlmFallback | undefined {
  if (!apiKey) {
    return undefined
  }

  return async (rawText: string) => {
    const response = await fetch('https://api.openai.com/v1/responses', {
      method: 'POST',
      headers: {
        authorization: `Bearer ${apiKey}`,
        'content-type': 'application/json'
      },
      body: JSON.stringify({
        model,
        input: [
          {
            role: 'system',
            content:
              'Extract a shared household purchase from text. Return only valid JSON with amountMinor, currency, itemDescription, confidence, needsReview.'
          },
          {
            role: 'user',
            content: rawText
          }
        ],
        text: {
          format: {
            type: 'json_schema',
            name: 'purchase_parse',
            schema: {
              type: 'object',
              additionalProperties: false,
              properties: {
                amountMinor: {
                  type: 'string'
                },
                currency: {
                  type: 'string',
                  enum: ['GEL', 'USD']
                },
                itemDescription: {
                  type: 'string'
                },
                confidence: {
                  type: 'number',
                  minimum: 0,
                  maximum: 100
                },
                needsReview: {
                  type: 'boolean'
                }
              },
              required: ['amountMinor', 'currency', 'itemDescription', 'confidence', 'needsReview']
            }
          }
        }
      })
    })

    if (!response.ok) {
      return null
    }

    const outputText = extractOutputText(await response.json())
    if (outputText === null) {
      return null
    }

    let parsedJson: unknown
    try {
      parsedJson = JSON.parse(outputText)
    } catch {
      return null
    }

    // Reject anything that is not exactly the requested shape (wrong types,
    // unknown currency, non-finite confidence) before touching its fields.
    if (!isStructuredResult(parsedJson)) {
      return null
    }

    const amountMinor = asBigInt(parsedJson.amountMinor)
    if (amountMinor === null) {
      return null
    }

    if (parsedJson.itemDescription.trim().length === 0) {
      return null
    }

    return {
      amountMinor,
      currency: parsedJson.currency,
      itemDescription: parsedJson.itemDescription,
      // Confidence is normalised to an integer in 0..100.
      confidence: Math.max(0, Math.min(100, Math.round(parsedJson.confidence))),
      parserMode: 'llm',
      needsReview: parsedJson.needsReview
    }
  }
}

View File

@@ -1,3 +1,4 @@
import { parsePurchaseMessage, type PurchaseParserLlmFallback } from '@household/application'
import { and, eq } from 'drizzle-orm' import { and, eq } from 'drizzle-orm'
import type { Bot, Context } from 'grammy' import type { Bot, Context } from 'grammy'
@@ -25,7 +26,10 @@ export interface PurchaseTopicRecord extends PurchaseTopicCandidate {
} }
export interface PurchaseMessageIngestionRepository { export interface PurchaseMessageIngestionRepository {
save(record: PurchaseTopicRecord): Promise<'created' | 'duplicate'> save(
record: PurchaseTopicRecord,
llmFallback?: PurchaseParserLlmFallback
): Promise<'created' | 'duplicate'>
} }
export function extractPurchaseTopicCandidate( export function extractPurchaseTopicCandidate(
@@ -52,6 +56,10 @@ export function extractPurchaseTopicCandidate(
} }
} }
/**
 * Maps the boolean review flag to its integer form (1 = needs review, 0 = does not).
 */
function needsReviewAsInt(value: boolean): number {
  if (value) {
    return 1
  }
  return 0
}
export function createPurchaseMessageRepository(databaseUrl: string): { export function createPurchaseMessageRepository(databaseUrl: string): {
repository: PurchaseMessageIngestionRepository repository: PurchaseMessageIngestionRepository
close: () => Promise<void> close: () => Promise<void>
@@ -62,7 +70,7 @@ export function createPurchaseMessageRepository(databaseUrl: string): {
}) })
const repository: PurchaseMessageIngestionRepository = { const repository: PurchaseMessageIngestionRepository = {
async save(record) { async save(record, llmFallback) {
const matchedMember = await db const matchedMember = await db
.select({ id: schema.members.id }) .select({ id: schema.members.id })
.from(schema.members) .from(schema.members)
@@ -75,6 +83,30 @@ export function createPurchaseMessageRepository(databaseUrl: string): {
.limit(1) .limit(1)
const senderMemberId = matchedMember[0]?.id ?? null const senderMemberId = matchedMember[0]?.id ?? null
let parserError: string | null = null
const parsed = await parsePurchaseMessage(
{
rawText: record.rawText
},
llmFallback
? {
llmFallback
}
: {}
).catch((error) => {
parserError = error instanceof Error ? error.message : 'Unknown parser error'
return null
})
const processingStatus =
parserError !== null
? 'parse_failed'
: parsed === null
? 'needs_review'
: parsed.needsReview
? 'needs_review'
: 'parsed'
const inserted = await db const inserted = await db
.insert(schema.purchaseMessages) .insert(schema.purchaseMessages)
@@ -89,7 +121,14 @@ export function createPurchaseMessageRepository(databaseUrl: string): {
telegramThreadId: record.threadId, telegramThreadId: record.threadId,
telegramUpdateId: String(record.updateId), telegramUpdateId: String(record.updateId),
messageSentAt: record.messageSentAt, messageSentAt: record.messageSentAt,
processingStatus: 'pending' parsedAmountMinor: parsed?.amountMinor,
parsedCurrency: parsed?.currency,
parsedItemDescription: parsed?.itemDescription,
parserMode: parsed?.parserMode,
parserConfidence: parsed?.confidence,
needsReview: needsReviewAsInt(parsed?.needsReview ?? true),
parserError,
processingStatus
}) })
.onConflictDoNothing({ .onConflictDoNothing({
target: [ target: [
@@ -151,7 +190,10 @@ function toCandidateFromContext(ctx: Context): PurchaseTopicCandidate | null {
export function registerPurchaseTopicIngestion( export function registerPurchaseTopicIngestion(
bot: Bot, bot: Bot,
config: PurchaseTopicIngestionConfig, config: PurchaseTopicIngestionConfig,
repository: PurchaseMessageIngestionRepository repository: PurchaseMessageIngestionRepository,
options: {
llmFallback?: PurchaseParserLlmFallback
} = {}
): void { ): void {
bot.on('message:text', async (ctx) => { bot.on('message:text', async (ctx) => {
const candidate = toCandidateFromContext(ctx) const candidate = toCandidateFromContext(ctx)
@@ -165,7 +207,7 @@ export function registerPurchaseTopicIngestion(
} }
try { try {
const status = await repository.save(record) const status = await repository.save(record, options.llmFallback)
if (status === 'created') { if (status === 'created') {
console.log( console.log(

View File

@@ -15,6 +15,7 @@
"apps/bot": { "apps/bot": {
"name": "@household/bot", "name": "@household/bot",
"dependencies": { "dependencies": {
"@household/application": "workspace:*",
"@household/db": "workspace:*", "@household/db": "workspace:*",
"drizzle-orm": "^0.44.7", "drizzle-orm": "^0.44.7",
"grammy": "1.41.1", "grammy": "1.41.1",

View File

@@ -0,0 +1,80 @@
# HOUSEBOT-022: Hybrid Purchase Parser
## Summary
Implement a rules-first purchase parser with optional LLM fallback for ambiguous Telegram purchase messages.
## Goals
- Parse common RU/EN purchase text with deterministic regex rules first.
- Call LLM fallback only when rules cannot safely resolve a single amount.
- Persist raw + parsed fields + confidence + parser mode.
## Non-goals
- Receipt OCR.
- Complex multi-item itemization.
## Scope
- In: parser core logic, fallback interface, bot ingestion integration, DB fields for parser output.
- Out: settlement posting and command UIs.
## Interfaces and Contracts
- `parsePurchaseMessage({ rawText }, { llmFallback? })`
- Parser result fields:
  - `amountMinor`
  - `currency`
  - `itemDescription`
  - `confidence`
  - `parserMode` (`rules` | `llm`)
  - `needsReview`
## Domain Rules
- Rules parser attempts single-amount extraction first.
- Missing currency defaults to GEL and marks `needsReview=true`.
- Ambiguous text (multiple amounts) triggers LLM fallback if configured.
## Data Model Changes
- `purchase_messages` stores parsed fields:
  - `parsed_amount_minor`
  - `parsed_currency`
  - `parsed_item_description`
  - `parser_mode`
  - `parser_confidence`
  - `needs_review`
  - `parser_error`
## Security and Privacy
- LLM fallback sends only the raw message text to the model; no sender, chat, or other Telegram metadata is included.
- API key required for fallback path.
## Observability
- `processing_status` and `parser_error` capture parse outcomes.
## Edge Cases and Failure Modes
- Empty message text.
- Multiple numeric amounts.
- Invalid LLM output payload.
- Missing API key disables LLM fallback.
## Test Plan
- Unit tests for rules parser and fallback behavior.
- Ingestion tests for topic filter remain valid.
## Acceptance Criteria
- [ ] Rules parser handles common message patterns.
- [ ] LLM fallback is invoked only when rules are insufficient.
- [ ] Parsed result + confidence + parser mode persisted.
## Rollout Plan
- Enable in dev group and monitor `needs_review` rate before stricter auto-accept rules.

View File

@@ -1 +1,9 @@
export { calculateMonthlySettlement } from './settlement-engine' export { calculateMonthlySettlement } from './settlement-engine'
export {
parsePurchaseMessage,
type ParsedPurchaseResult,
type ParsePurchaseInput,
type ParsePurchaseOptions,
type PurchaseParserLlmFallback,
type PurchaseParserMode
} from './purchase-parser'

View File

@@ -0,0 +1,63 @@
import { describe, expect, test } from 'bun:test'
import { parsePurchaseMessage } from './purchase-parser'
// Behavioural tests for parsePurchaseMessage: the rules path, the default
// currency path, the LLM fallback path, and the "nothing parsed" path.
describe('parsePurchaseMessage', () => {
// One amount followed by an explicit currency token: handled by rules, no review needed.
test('parses explicit currency with rules', async () => {
const result = await parsePurchaseMessage({
rawText: 'Купил туалетную бумагу 30 gel'
})
expect(result).not.toBeNull()
expect(result?.amountMinor).toBe(3000n)
expect(result?.currency).toBe('GEL')
expect(result?.parserMode).toBe('rules')
expect(result?.needsReview).toBe(false)
})
// One amount without a currency token: GEL is assumed and the result is flagged for review.
test('defaults to GEL when currency is omitted and marks review', async () => {
const result = await parsePurchaseMessage({
rawText: 'Bought soap 12.5'
})
expect(result).not.toBeNull()
expect(result?.amountMinor).toBe(1250n)
expect(result?.currency).toBe('GEL')
expect(result?.needsReview).toBe(true)
})
// Two amounts in the text: rules give up and the supplied fallback result is returned.
test('uses llm fallback for ambiguous message with multiple amounts', async () => {
const result = await parsePurchaseMessage(
{
rawText: 'Купил пасту 10 и мыло 5'
},
{
llmFallback: async () => ({
amountMinor: 1500n,
currency: 'GEL',
itemDescription: 'паста и мыло',
confidence: 67,
parserMode: 'llm',
needsReview: true
})
}
)
expect(result).not.toBeNull()
expect(result?.parserMode).toBe('llm')
expect(result?.amountMinor).toBe(1500n)
})
// No amount in the text and the fallback yields null: the overall result is null.
test('returns null when both rules and llm fail', async () => {
const result = await parsePurchaseMessage(
{
rawText: 'без суммы вообще'
},
{
llmFallback: async () => null
}
)
expect(result).toBeNull()
})
})

View File

@@ -0,0 +1,132 @@
/** Identifies which parsing strategy produced a result. */
export type PurchaseParserMode = 'rules' | 'llm'
/** Structured purchase extracted from a free-text message. */
export interface ParsedPurchaseResult {
/** Amount as a bigint; the rules parser derives it by scaling the matched number to two fraction digits. */
amountMinor: bigint
/** Resolved currency; the rules parser uses GEL when the text names none. */
currency: 'GEL' | 'USD'
/** Description of what was bought. */
itemDescription: string
/** Confidence score; LLM results outside 0..100 are rejected. */
confidence: number
/** Strategy that produced this result. */
parserMode: PurchaseParserMode
/** True when the result should be checked by a person. */
needsReview: boolean
}
/** Async fallback invoked with the trimmed message text when the rules parser returns nothing. */
export type PurchaseParserLlmFallback = (rawText: string) => Promise<ParsedPurchaseResult | null>
/** Input accepted by `parsePurchaseMessage`. */
export interface ParsePurchaseInput {
/** Message text as received; it is trimmed before parsing. */
rawText: string
}
/** Optional behaviour switches for `parsePurchaseMessage`. */
export interface ParsePurchaseOptions {
/** When omitted, text the rules parser cannot handle yields `null`. */
llmFallback?: PurchaseParserLlmFallback
}
// Alternation of recognised currency tokens: symbols, Latin codes/names and Russian names.
// Kept as a string so it can be interpolated into the RegExp below.
const CURRENCY_PATTERN = '(?:₾|gel|lari|лари|usd|\\$|доллар(?:а|ov)?)'
// Matches a number with an optional 1-2 digit fraction (dot or comma separator), followed by
// optional whitespace and an optional currency token. Flags: global, case-insensitive, unicode.
// NOTE(review): nothing is asserted after the currency token, so a token can also match as the
// prefix of a longer word (e.g. "gel" in "gelato") — confirm this is acceptable.
const AMOUNT_WITH_OPTIONAL_CURRENCY = new RegExp(
`(?<amount>\\d+(?:[.,]\\d{1,2})?)\\s*(?<currency>${CURRENCY_PATTERN})?`,
'giu'
)
/**
 * Maps a matched currency token to its currency code.
 *
 * @param raw - Token captured from the message, if any.
 * @returns `'GEL'` or `'USD'` for a recognised token; `null` for a missing,
 *   blank, or unrecognised one.
 */
function normalizeCurrency(raw: string | undefined): 'GEL' | 'USD' | null {
  const token = (raw ?? '').trim().toLowerCase()

  switch (token) {
    case '₾':
    case 'gel':
    case 'lari':
    case 'лари':
      return 'GEL'
    case 'usd':
    case '$':
      return 'USD'
    default:
      // Any token beginning with the Russian stem for "dollar" counts as USD.
      return token.startsWith('доллар') ? 'USD' : null
  }
}
/**
 * Converts a decimal amount string into an integer count of hundredths.
 *
 * The first comma is treated as a decimal point; the fraction is right-padded
 * with zeros to two digits and anything beyond two digits is dropped.
 *
 * @param rawAmount - Amount text such as `"30"`, `"12.5"` or `"12,05"`.
 * @returns The amount scaled by 100 as a bigint.
 */
function toMinorUnits(rawAmount: string): bigint {
  const pieces = rawAmount.replace(',', '.').split('.')
  const whole = pieces[0]
  const fraction = pieces[1] ?? ''
  const minorDigits = fraction.padEnd(2, '0').slice(0, 2)

  return BigInt(`${whole}${minorDigits}`)
}
/**
 * Produces the item description by removing the matched amount fragment.
 *
 * @param rawText - Full message text.
 * @param matchedFragment - Amount (and currency) text; its first occurrence is removed.
 * @returns The remaining words joined by single spaces, or `'shared purchase'`
 *   when nothing is left.
 */
function normalizeDescription(rawText: string, matchedFragment: string): string {
  const remainingWords = rawText
    .replace(matchedFragment, ' ')
    .split(/\s+/)
    .filter((word) => word.length > 0)

  return remainingWords.length > 0 ? remainingWords.join(' ') : 'shared purchase'
}
/**
 * Deterministic parser for messages that contain exactly one amount.
 *
 * @param rawText - Trimmed message text.
 * @returns A `rules` result, or `null` when the text holds zero or several
 *   amounts or when the single amount is not strictly positive. A `null`
 *   lets the caller try the LLM fallback.
 */
function parseWithRules(rawText: string): ParsedPurchaseResult | null {
  const matches = Array.from(rawText.matchAll(AMOUNT_WITH_OPTIONAL_CURRENCY))

  // Zero amounts means nothing to parse; several amounts is ambiguous.
  if (matches.length !== 1) {
    return null
  }

  const [match] = matches
  const rawAmount = match?.groups?.amount
  if (!match || !rawAmount) {
    return null
  }

  const amountMinor = toMinorUnits(rawAmount)
  // Keep the rules path consistent with LLM result validation, which only
  // accepts strictly positive amounts: "0" is not a purchase.
  if (amountMinor <= 0n) {
    return null
  }

  const currency = normalizeCurrency(match.groups?.currency)
  const explicitCurrency = currency !== null

  return {
    amountMinor,
    // Without an explicit currency GEL is assumed and the result is flagged for review.
    currency: currency ?? 'GEL',
    itemDescription: normalizeDescription(rawText, match[0] ?? ''),
    confidence: explicitCurrency ? 92 : 78,
    parserMode: 'rules',
    needsReview: !explicitCurrency
  }
}
/**
 * Sanity-checks a result produced by the LLM fallback.
 *
 * @param result - Fallback output, or `null` when the fallback produced nothing.
 * @returns The same object when it passes every check; otherwise `null`.
 *   A result is rejected when the amount is not strictly positive, the
 *   confidence is not a finite number within 0..100, or the description is blank.
 */
function validateLlmResult(result: ParsedPurchaseResult | null): ParsedPurchaseResult | null {
  if (!result) {
    return null
  }

  if (result.amountMinor <= 0n) {
    return null
  }

  // NaN compares false against both bounds, so it needs an explicit check.
  if (!Number.isFinite(result.confidence) || result.confidence < 0 || result.confidence > 100) {
    return null
  }

  if (result.itemDescription.trim().length === 0) {
    return null
  }

  return result
}
/**
 * Parses a purchase message, trying deterministic rules before the optional LLM fallback.
 *
 * @param input - Message to parse; `rawText` is trimmed first.
 * @param options - Optional `llmFallback` used when the rules parser returns nothing.
 * @returns The rules result when available; otherwise the validated fallback
 *   result; `null` for blank text, when no fallback is configured, or when
 *   the fallback result fails validation.
 */
export async function parsePurchaseMessage(
  input: ParsePurchaseInput,
  options: ParsePurchaseOptions = {}
): Promise<ParsedPurchaseResult | null> {
  const text = input.rawText.trim()
  if (text.length === 0) {
    return null
  }

  const viaRules = parseWithRules(text)
  if (viaRules) {
    return viaRules
  }

  // Rules could not resolve the message; defer to the fallback only if one was supplied.
  return options.llmFallback ? validateLlmResult(await options.llmFallback(text)) : null
}

View File

@@ -0,0 +1,7 @@
-- Adds parser output columns to purchase_messages. All new columns are nullable
-- except needs_review, which is NOT NULL with a default of 1.
ALTER TABLE "purchase_messages" ADD COLUMN "parsed_amount_minor" bigint;--> statement-breakpoint
ALTER TABLE "purchase_messages" ADD COLUMN "parsed_currency" text;--> statement-breakpoint
ALTER TABLE "purchase_messages" ADD COLUMN "parsed_item_description" text;--> statement-breakpoint
ALTER TABLE "purchase_messages" ADD COLUMN "parser_mode" text;--> statement-breakpoint
ALTER TABLE "purchase_messages" ADD COLUMN "parser_confidence" integer;--> statement-breakpoint
ALTER TABLE "purchase_messages" ADD COLUMN "needs_review" integer DEFAULT 1 NOT NULL;--> statement-breakpoint
ALTER TABLE "purchase_messages" ADD COLUMN "parser_error" text;

File diff suppressed because it is too large Load Diff

View File

@@ -22,6 +22,13 @@
"when": 1772670548136, "when": 1772670548136,
"tag": "0002_tough_sandman", "tag": "0002_tough_sandman",
"breakpoints": true "breakpoints": true
},
{
"idx": 3,
"version": "7",
"when": 1772671128084,
"tag": "0003_mature_roulette",
"breakpoints": true
} }
] ]
} }

View File

@@ -198,6 +198,13 @@ export const purchaseMessages = pgTable(
telegramThreadId: text('telegram_thread_id').notNull(), telegramThreadId: text('telegram_thread_id').notNull(),
telegramUpdateId: text('telegram_update_id').notNull(), telegramUpdateId: text('telegram_update_id').notNull(),
messageSentAt: timestamp('message_sent_at', { withTimezone: true }), messageSentAt: timestamp('message_sent_at', { withTimezone: true }),
parsedAmountMinor: bigint('parsed_amount_minor', { mode: 'bigint' }),
parsedCurrency: text('parsed_currency'),
parsedItemDescription: text('parsed_item_description'),
parserMode: text('parser_mode'),
parserConfidence: integer('parser_confidence'),
needsReview: integer('needs_review').default(1).notNull(),
parserError: text('parser_error'),
processingStatus: text('processing_status').default('pending').notNull(), processingStatus: text('processing_status').default('pending').notNull(),
ingestedAt: timestamp('ingested_at', { withTimezone: true }).defaultNow().notNull() ingestedAt: timestamp('ingested_at', { withTimezone: true }).defaultNow().notNull()
}, },