mirror of
https://github.com/whekin/household-bot.git
synced 2026-03-31 13:54:02 +00:00
Merge pull request #11 from whekin/codex/whe-23-purchase-parser
feat(WHE-23): implement hybrid purchase parser (rules + LLM fallback)
This commit is contained in:
@@ -10,6 +10,7 @@
|
|||||||
"lint": "oxlint \"src\""
|
"lint": "oxlint \"src\""
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@household/application": "workspace:*",
|
||||||
"@household/db": "workspace:*",
|
"@household/db": "workspace:*",
|
||||||
"drizzle-orm": "^0.44.7",
|
"drizzle-orm": "^0.44.7",
|
||||||
"grammy": "1.41.1"
|
"grammy": "1.41.1"
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ export interface BotRuntimeConfig {
|
|||||||
telegramHouseholdChatId?: string
|
telegramHouseholdChatId?: string
|
||||||
telegramPurchaseTopicId?: number
|
telegramPurchaseTopicId?: number
|
||||||
purchaseTopicIngestionEnabled: boolean
|
purchaseTopicIngestionEnabled: boolean
|
||||||
|
openaiApiKey?: string
|
||||||
|
parserModel: string
|
||||||
}
|
}
|
||||||
|
|
||||||
function parsePort(raw: string | undefined): number {
|
function parsePort(raw: string | undefined): number {
|
||||||
@@ -66,7 +68,8 @@ export function getBotRuntimeConfig(env: NodeJS.ProcessEnv = process.env): BotRu
|
|||||||
telegramBotToken: requireValue(env.TELEGRAM_BOT_TOKEN, 'TELEGRAM_BOT_TOKEN'),
|
telegramBotToken: requireValue(env.TELEGRAM_BOT_TOKEN, 'TELEGRAM_BOT_TOKEN'),
|
||||||
telegramWebhookSecret: requireValue(env.TELEGRAM_WEBHOOK_SECRET, 'TELEGRAM_WEBHOOK_SECRET'),
|
telegramWebhookSecret: requireValue(env.TELEGRAM_WEBHOOK_SECRET, 'TELEGRAM_WEBHOOK_SECRET'),
|
||||||
telegramWebhookPath: env.TELEGRAM_WEBHOOK_PATH ?? '/webhook/telegram',
|
telegramWebhookPath: env.TELEGRAM_WEBHOOK_PATH ?? '/webhook/telegram',
|
||||||
purchaseTopicIngestionEnabled
|
purchaseTopicIngestionEnabled,
|
||||||
|
parserModel: env.PARSER_MODEL?.trim() || 'gpt-4.1-mini'
|
||||||
}
|
}
|
||||||
|
|
||||||
if (databaseUrl !== undefined) {
|
if (databaseUrl !== undefined) {
|
||||||
@@ -81,6 +84,10 @@ export function getBotRuntimeConfig(env: NodeJS.ProcessEnv = process.env): BotRu
|
|||||||
if (telegramPurchaseTopicId !== undefined) {
|
if (telegramPurchaseTopicId !== undefined) {
|
||||||
runtime.telegramPurchaseTopicId = telegramPurchaseTopicId
|
runtime.telegramPurchaseTopicId = telegramPurchaseTopicId
|
||||||
}
|
}
|
||||||
|
const openaiApiKey = parseOptionalValue(env.OPENAI_API_KEY)
|
||||||
|
if (openaiApiKey !== undefined) {
|
||||||
|
runtime.openaiApiKey = openaiApiKey
|
||||||
|
}
|
||||||
|
|
||||||
return runtime
|
return runtime
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import { webhookCallback } from 'grammy'
|
|||||||
|
|
||||||
import { createTelegramBot } from './bot'
|
import { createTelegramBot } from './bot'
|
||||||
import { getBotRuntimeConfig } from './config'
|
import { getBotRuntimeConfig } from './config'
|
||||||
|
import { createOpenAiParserFallback } from './openai-parser-fallback'
|
||||||
import {
|
import {
|
||||||
createPurchaseMessageRepository,
|
createPurchaseMessageRepository,
|
||||||
registerPurchaseTopicIngestion
|
registerPurchaseTopicIngestion
|
||||||
@@ -17,6 +18,7 @@ let closePurchaseRepository: (() => Promise<void>) | undefined
|
|||||||
if (runtime.purchaseTopicIngestionEnabled) {
|
if (runtime.purchaseTopicIngestionEnabled) {
|
||||||
const purchaseRepositoryClient = createPurchaseMessageRepository(runtime.databaseUrl!)
|
const purchaseRepositoryClient = createPurchaseMessageRepository(runtime.databaseUrl!)
|
||||||
closePurchaseRepository = purchaseRepositoryClient.close
|
closePurchaseRepository = purchaseRepositoryClient.close
|
||||||
|
const llmFallback = createOpenAiParserFallback(runtime.openaiApiKey, runtime.parserModel)
|
||||||
|
|
||||||
registerPurchaseTopicIngestion(
|
registerPurchaseTopicIngestion(
|
||||||
bot,
|
bot,
|
||||||
@@ -25,7 +27,12 @@ if (runtime.purchaseTopicIngestionEnabled) {
|
|||||||
householdChatId: runtime.telegramHouseholdChatId!,
|
householdChatId: runtime.telegramHouseholdChatId!,
|
||||||
purchaseTopicId: runtime.telegramPurchaseTopicId!
|
purchaseTopicId: runtime.telegramPurchaseTopicId!
|
||||||
},
|
},
|
||||||
purchaseRepositoryClient.repository
|
purchaseRepositoryClient.repository,
|
||||||
|
llmFallback
|
||||||
|
? {
|
||||||
|
llmFallback
|
||||||
|
}
|
||||||
|
: {}
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
console.warn(
|
console.warn(
|
||||||
|
|||||||
119
apps/bot/src/openai-parser-fallback.ts
Normal file
119
apps/bot/src/openai-parser-fallback.ts
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
import type { PurchaseParserLlmFallback } from '@household/application'
|
||||||
|
|
||||||
|
interface OpenAiStructuredResult {
|
||||||
|
amountMinor: string
|
||||||
|
currency: 'GEL' | 'USD'
|
||||||
|
itemDescription: string
|
||||||
|
confidence: number
|
||||||
|
needsReview: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Largest amount that fits the `parsed_amount_minor` column, which is a
 * PostgreSQL `bigint` (signed 64-bit integer).
 */
const MAX_STORABLE_AMOUNT_MINOR = 9223372036854775807n

/**
 * Converts a model-provided minor-unit amount into a positive `bigint`.
 *
 * The value originates from JSON that is only type-cast, never validated, so
 * the runtime type is checked here as well: anything other than a string made
 * solely of ASCII digits is rejected instead of being coerced.
 *
 * @param value - Amount in minor units, expected to be a digits-only string.
 * @returns The parsed amount, or `null` when the value is not a digits-only
 *   string, is zero, or is too large to be stored in a 64-bit integer column.
 */
function asBigInt(value: string): bigint | null {
  if (typeof value !== 'string' || !/^[0-9]+$/.test(value)) {
    return null
  }

  const parsed = BigInt(value)
  if (parsed <= 0n || parsed > MAX_STORABLE_AMOUNT_MINOR) {
    return null
  }

  return parsed
}
|
||||||
|
|
||||||
|
/** Narrows an unknown JSON value to a plain key/value object. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null
}

/**
 * Pulls the generated text out of a Responses API payload.
 *
 * A top-level `output_text` string is used when present; otherwise the text of
 * every `output_text` content part found under `output[].content[]` is
 * concatenated. NOTE(review): the top-level aggregate is documented as an SDK
 * convenience rather than part of the raw HTTP response - confirm against the
 * OpenAI API reference.
 *
 * @returns The non-empty generated text, or `null` when none is found.
 */
function extractOutputText(payload: unknown): string | null {
  if (!isJsonObject(payload)) {
    return null
  }

  const aggregated = payload.output_text
  if (typeof aggregated === 'string' && aggregated.length > 0) {
    return aggregated
  }

  const items = payload.output
  if (!Array.isArray(items)) {
    return null
  }

  const chunks: string[] = []
  for (const item of items) {
    const parts = isJsonObject(item) ? item.content : undefined
    if (!Array.isArray(parts)) {
      continue
    }

    for (const part of parts) {
      if (!isJsonObject(part) || part.type !== 'output_text') {
        continue
      }

      const text = part.text
      if (typeof text === 'string') {
        chunks.push(text)
      }
    }
  }

  const combined = chunks.join('')
  return combined.length > 0 ? combined : null
}

/**
 * Runtime check that parsed model output has the requested shape.
 * The request does not enable strict schema mode, so the JSON is verified
 * here instead of being trusted.
 */
function isStructuredResult(value: unknown): value is OpenAiStructuredResult {
  if (!isJsonObject(value)) {
    return false
  }

  const { amountMinor, currency, itemDescription, confidence, needsReview } = value

  return (
    typeof amountMinor === 'string' &&
    (currency === 'GEL' || currency === 'USD') &&
    typeof itemDescription === 'string' &&
    typeof confidence === 'number' &&
    Number.isFinite(confidence) &&
    typeof needsReview === 'boolean'
  )
}

/**
 * Builds the LLM fallback used when the rules parser cannot resolve a message.
 *
 * The returned function POSTs the raw message text to the OpenAI Responses
 * endpoint, asking for JSON that follows the `purchase_parse` schema, and
 * converts the answer into a parser result with `parserMode: 'llm'`.
 *
 * It resolves to `null` when the HTTP status is not OK, when no output text is
 * present, when the text is not valid JSON of the expected shape, when the
 * amount is not a positive digits-only string, or when the description is
 * blank. Network failures and an unreadable response body are not caught here
 * and reject the returned promise.
 *
 * @param apiKey - OpenAI API key; when missing or empty no fallback is created.
 * @param model - Model identifier sent with every request.
 * @returns The fallback function, or `undefined` when `apiKey` is not set.
 */
export function createOpenAiParserFallback(
  apiKey: string | undefined,
  model: string
): PurchaseParserLlmFallback | undefined {
  if (!apiKey) {
    return undefined
  }

  return async (rawText: string) => {
    const response = await fetch('https://api.openai.com/v1/responses', {
      method: 'POST',
      headers: {
        authorization: `Bearer ${apiKey}`,
        'content-type': 'application/json'
      },
      body: JSON.stringify({
        model,
        input: [
          {
            role: 'system',
            content:
              'Extract a shared household purchase from text. Return only valid JSON with amountMinor, currency, itemDescription, confidence, needsReview.'
          },
          {
            role: 'user',
            content: rawText
          }
        ],
        text: {
          format: {
            type: 'json_schema',
            name: 'purchase_parse',
            schema: {
              type: 'object',
              additionalProperties: false,
              properties: {
                amountMinor: {
                  type: 'string'
                },
                currency: {
                  type: 'string',
                  enum: ['GEL', 'USD']
                },
                itemDescription: {
                  type: 'string'
                },
                confidence: {
                  type: 'number',
                  minimum: 0,
                  maximum: 100
                },
                needsReview: {
                  type: 'boolean'
                }
              },
              required: ['amountMinor', 'currency', 'itemDescription', 'confidence', 'needsReview']
            }
          }
        }
      })
    })

    if (!response.ok) {
      return null
    }

    const outputText = extractOutputText(await response.json())
    if (outputText === null) {
      return null
    }

    let parsedJson: unknown
    try {
      parsedJson = JSON.parse(outputText)
    } catch {
      return null
    }

    if (!isStructuredResult(parsedJson)) {
      return null
    }

    const amountMinor = asBigInt(parsedJson.amountMinor)
    if (amountMinor === null) {
      return null
    }

    if (parsedJson.itemDescription.trim().length === 0) {
      return null
    }

    return {
      amountMinor,
      currency: parsedJson.currency,
      itemDescription: parsedJson.itemDescription,
      // The stored confidence is an integer percentage, so round and clamp.
      confidence: Math.max(0, Math.min(100, Math.round(parsedJson.confidence))),
      parserMode: 'llm',
      needsReview: parsedJson.needsReview
    }
  }
}
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import { parsePurchaseMessage, type PurchaseParserLlmFallback } from '@household/application'
|
||||||
import { and, eq } from 'drizzle-orm'
|
import { and, eq } from 'drizzle-orm'
|
||||||
import type { Bot, Context } from 'grammy'
|
import type { Bot, Context } from 'grammy'
|
||||||
|
|
||||||
@@ -25,7 +26,10 @@ export interface PurchaseTopicRecord extends PurchaseTopicCandidate {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export interface PurchaseMessageIngestionRepository {
|
export interface PurchaseMessageIngestionRepository {
|
||||||
save(record: PurchaseTopicRecord): Promise<'created' | 'duplicate'>
|
save(
|
||||||
|
record: PurchaseTopicRecord,
|
||||||
|
llmFallback?: PurchaseParserLlmFallback
|
||||||
|
): Promise<'created' | 'duplicate'>
|
||||||
}
|
}
|
||||||
|
|
||||||
export function extractPurchaseTopicCandidate(
|
export function extractPurchaseTopicCandidate(
|
||||||
@@ -52,6 +56,10 @@ export function extractPurchaseTopicCandidate(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Maps the review flag onto the integer form used for the `needsReview`
 * column value.
 *
 * @param value - Whether the parsed purchase still needs a human check.
 * @returns `1` when review is needed, otherwise `0`.
 */
function needsReviewAsInt(value: boolean): number {
  if (value) {
    return 1
  }

  return 0
}
|
||||||
|
|
||||||
export function createPurchaseMessageRepository(databaseUrl: string): {
|
export function createPurchaseMessageRepository(databaseUrl: string): {
|
||||||
repository: PurchaseMessageIngestionRepository
|
repository: PurchaseMessageIngestionRepository
|
||||||
close: () => Promise<void>
|
close: () => Promise<void>
|
||||||
@@ -62,7 +70,7 @@ export function createPurchaseMessageRepository(databaseUrl: string): {
|
|||||||
})
|
})
|
||||||
|
|
||||||
const repository: PurchaseMessageIngestionRepository = {
|
const repository: PurchaseMessageIngestionRepository = {
|
||||||
async save(record) {
|
async save(record, llmFallback) {
|
||||||
const matchedMember = await db
|
const matchedMember = await db
|
||||||
.select({ id: schema.members.id })
|
.select({ id: schema.members.id })
|
||||||
.from(schema.members)
|
.from(schema.members)
|
||||||
@@ -75,6 +83,30 @@ export function createPurchaseMessageRepository(databaseUrl: string): {
|
|||||||
.limit(1)
|
.limit(1)
|
||||||
|
|
||||||
const senderMemberId = matchedMember[0]?.id ?? null
|
const senderMemberId = matchedMember[0]?.id ?? null
|
||||||
|
let parserError: string | null = null
|
||||||
|
|
||||||
|
const parsed = await parsePurchaseMessage(
|
||||||
|
{
|
||||||
|
rawText: record.rawText
|
||||||
|
},
|
||||||
|
llmFallback
|
||||||
|
? {
|
||||||
|
llmFallback
|
||||||
|
}
|
||||||
|
: {}
|
||||||
|
).catch((error) => {
|
||||||
|
parserError = error instanceof Error ? error.message : 'Unknown parser error'
|
||||||
|
return null
|
||||||
|
})
|
||||||
|
|
||||||
|
const processingStatus =
|
||||||
|
parserError !== null
|
||||||
|
? 'parse_failed'
|
||||||
|
: parsed === null
|
||||||
|
? 'needs_review'
|
||||||
|
: parsed.needsReview
|
||||||
|
? 'needs_review'
|
||||||
|
: 'parsed'
|
||||||
|
|
||||||
const inserted = await db
|
const inserted = await db
|
||||||
.insert(schema.purchaseMessages)
|
.insert(schema.purchaseMessages)
|
||||||
@@ -89,7 +121,14 @@ export function createPurchaseMessageRepository(databaseUrl: string): {
|
|||||||
telegramThreadId: record.threadId,
|
telegramThreadId: record.threadId,
|
||||||
telegramUpdateId: String(record.updateId),
|
telegramUpdateId: String(record.updateId),
|
||||||
messageSentAt: record.messageSentAt,
|
messageSentAt: record.messageSentAt,
|
||||||
processingStatus: 'pending'
|
parsedAmountMinor: parsed?.amountMinor,
|
||||||
|
parsedCurrency: parsed?.currency,
|
||||||
|
parsedItemDescription: parsed?.itemDescription,
|
||||||
|
parserMode: parsed?.parserMode,
|
||||||
|
parserConfidence: parsed?.confidence,
|
||||||
|
needsReview: needsReviewAsInt(parsed?.needsReview ?? true),
|
||||||
|
parserError,
|
||||||
|
processingStatus
|
||||||
})
|
})
|
||||||
.onConflictDoNothing({
|
.onConflictDoNothing({
|
||||||
target: [
|
target: [
|
||||||
@@ -151,7 +190,10 @@ function toCandidateFromContext(ctx: Context): PurchaseTopicCandidate | null {
|
|||||||
export function registerPurchaseTopicIngestion(
|
export function registerPurchaseTopicIngestion(
|
||||||
bot: Bot,
|
bot: Bot,
|
||||||
config: PurchaseTopicIngestionConfig,
|
config: PurchaseTopicIngestionConfig,
|
||||||
repository: PurchaseMessageIngestionRepository
|
repository: PurchaseMessageIngestionRepository,
|
||||||
|
options: {
|
||||||
|
llmFallback?: PurchaseParserLlmFallback
|
||||||
|
} = {}
|
||||||
): void {
|
): void {
|
||||||
bot.on('message:text', async (ctx) => {
|
bot.on('message:text', async (ctx) => {
|
||||||
const candidate = toCandidateFromContext(ctx)
|
const candidate = toCandidateFromContext(ctx)
|
||||||
@@ -165,7 +207,7 @@ export function registerPurchaseTopicIngestion(
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const status = await repository.save(record)
|
const status = await repository.save(record, options.llmFallback)
|
||||||
|
|
||||||
if (status === 'created') {
|
if (status === 'created') {
|
||||||
console.log(
|
console.log(
|
||||||
|
|||||||
1
bun.lock
1
bun.lock
@@ -15,6 +15,7 @@
|
|||||||
"apps/bot": {
|
"apps/bot": {
|
||||||
"name": "@household/bot",
|
"name": "@household/bot",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@household/application": "workspace:*",
|
||||||
"@household/db": "workspace:*",
|
"@household/db": "workspace:*",
|
||||||
"drizzle-orm": "^0.44.7",
|
"drizzle-orm": "^0.44.7",
|
||||||
"grammy": "1.41.1",
|
"grammy": "1.41.1",
|
||||||
|
|||||||
80
docs/specs/HOUSEBOT-022-hybrid-purchase-parser.md
Normal file
80
docs/specs/HOUSEBOT-022-hybrid-purchase-parser.md
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
# HOUSEBOT-022: Hybrid Purchase Parser
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Implement a rules-first purchase parser with optional LLM fallback for ambiguous Telegram purchase messages.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
- Parse common RU/EN purchase text with deterministic regex rules first.
|
||||||
|
- Call LLM fallback only when rules cannot safely resolve a single amount.
|
||||||
|
- Persist raw + parsed fields + confidence + parser mode.
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- Receipt OCR.
|
||||||
|
- Complex multi-item itemization.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
- In: parser core logic, fallback interface, bot ingestion integration, DB fields for parser output.
|
||||||
|
- Out: settlement posting and command UIs.
|
||||||
|
|
||||||
|
## Interfaces and Contracts
|
||||||
|
|
||||||
|
- `parsePurchaseMessage({ rawText }, { llmFallback? })`
|
||||||
|
- Parser result fields:
|
||||||
|
- `amountMinor`
|
||||||
|
- `currency`
|
||||||
|
- `itemDescription`
|
||||||
|
- `confidence`
|
||||||
|
- `parserMode` (`rules` | `llm`)
|
||||||
|
- `needsReview`
|
||||||
|
|
||||||
|
## Domain Rules
|
||||||
|
|
||||||
|
- Rules parser attempts single-amount extraction first.
|
||||||
|
- Missing currency defaults to GEL and marks `needsReview=true`.
|
||||||
|
- Ambiguous text (no amount found, or multiple amounts) triggers LLM fallback if configured.
|
||||||
|
|
||||||
|
## Data Model Changes
|
||||||
|
|
||||||
|
- `purchase_messages` stores parsed fields:
|
||||||
|
- `parsed_amount_minor`
|
||||||
|
- `parsed_currency`
|
||||||
|
- `parsed_item_description`
|
||||||
|
- `parser_mode`
|
||||||
|
- `parser_confidence`
|
||||||
|
- `needs_review`
|
||||||
|
- `parser_error`
|
||||||
|
|
||||||
|
## Security and Privacy
|
||||||
|
|
||||||
|
- LLM fallback sends only minimal raw text needed for parsing.
|
||||||
|
- API key required for fallback path.
|
||||||
|
|
||||||
|
## Observability
|
||||||
|
|
||||||
|
- `processing_status` and `parser_error` capture parse outcomes.
|
||||||
|
|
||||||
|
## Edge Cases and Failure Modes
|
||||||
|
|
||||||
|
- Empty message text.
|
||||||
|
- Multiple numeric amounts.
|
||||||
|
- Invalid LLM output payload.
|
||||||
|
- Missing API key disables LLM fallback.
|
||||||
|
|
||||||
|
## Test Plan
|
||||||
|
|
||||||
|
- Unit tests for rules parser and fallback behavior.
|
||||||
|
- Ingestion tests for topic filter remain valid.
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
|
||||||
|
- [ ] Rules parser handles common message patterns.
|
||||||
|
- [ ] LLM fallback is invoked only when rules are insufficient.
|
||||||
|
- [ ] Parsed result + confidence + parser mode persisted.
|
||||||
|
|
||||||
|
## Rollout Plan
|
||||||
|
|
||||||
|
- Enable in dev group and monitor `needs_review` rate before stricter auto-accept rules.
|
||||||
@@ -1 +1,9 @@
|
|||||||
export { calculateMonthlySettlement } from './settlement-engine'
|
export { calculateMonthlySettlement } from './settlement-engine'
|
||||||
|
export {
|
||||||
|
parsePurchaseMessage,
|
||||||
|
type ParsedPurchaseResult,
|
||||||
|
type ParsePurchaseInput,
|
||||||
|
type ParsePurchaseOptions,
|
||||||
|
type PurchaseParserLlmFallback,
|
||||||
|
type PurchaseParserMode
|
||||||
|
} from './purchase-parser'
|
||||||
|
|||||||
63
packages/application/src/purchase-parser.test.ts
Normal file
63
packages/application/src/purchase-parser.test.ts
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
import { describe, expect, test } from 'bun:test'
|
||||||
|
|
||||||
|
import { parsePurchaseMessage } from './purchase-parser'
|
||||||
|
|
||||||
|
describe('parsePurchaseMessage', () => {
|
||||||
|
test('parses explicit currency with rules', async () => {
|
||||||
|
const result = await parsePurchaseMessage({
|
||||||
|
rawText: 'Купил туалетную бумагу 30 gel'
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(result).not.toBeNull()
|
||||||
|
expect(result?.amountMinor).toBe(3000n)
|
||||||
|
expect(result?.currency).toBe('GEL')
|
||||||
|
expect(result?.parserMode).toBe('rules')
|
||||||
|
expect(result?.needsReview).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
test('defaults to GEL when currency is omitted and marks review', async () => {
|
||||||
|
const result = await parsePurchaseMessage({
|
||||||
|
rawText: 'Bought soap 12.5'
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(result).not.toBeNull()
|
||||||
|
expect(result?.amountMinor).toBe(1250n)
|
||||||
|
expect(result?.currency).toBe('GEL')
|
||||||
|
expect(result?.needsReview).toBe(true)
|
||||||
|
})
|
||||||
|
|
||||||
|
test('uses llm fallback for ambiguous message with multiple amounts', async () => {
|
||||||
|
const result = await parsePurchaseMessage(
|
||||||
|
{
|
||||||
|
rawText: 'Купил пасту 10 и мыло 5'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
llmFallback: async () => ({
|
||||||
|
amountMinor: 1500n,
|
||||||
|
currency: 'GEL',
|
||||||
|
itemDescription: 'паста и мыло',
|
||||||
|
confidence: 67,
|
||||||
|
parserMode: 'llm',
|
||||||
|
needsReview: true
|
||||||
|
})
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
expect(result).not.toBeNull()
|
||||||
|
expect(result?.parserMode).toBe('llm')
|
||||||
|
expect(result?.amountMinor).toBe(1500n)
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns null when both rules and llm fail', async () => {
|
||||||
|
const result = await parsePurchaseMessage(
|
||||||
|
{
|
||||||
|
rawText: 'без суммы вообще'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
llmFallback: async () => null
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
expect(result).toBeNull()
|
||||||
|
})
|
||||||
|
})
|
||||||
132
packages/application/src/purchase-parser.ts
Normal file
132
packages/application/src/purchase-parser.ts
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
export type PurchaseParserMode = 'rules' | 'llm'
|
||||||
|
|
||||||
|
export interface ParsedPurchaseResult {
|
||||||
|
amountMinor: bigint
|
||||||
|
currency: 'GEL' | 'USD'
|
||||||
|
itemDescription: string
|
||||||
|
confidence: number
|
||||||
|
parserMode: PurchaseParserMode
|
||||||
|
needsReview: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export type PurchaseParserLlmFallback = (rawText: string) => Promise<ParsedPurchaseResult | null>
|
||||||
|
|
||||||
|
export interface ParsePurchaseInput {
|
||||||
|
rawText: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ParsePurchaseOptions {
|
||||||
|
llmFallback?: PurchaseParserLlmFallback
|
||||||
|
}
|
||||||
|
|
||||||
|
// Currency tokens accepted directly after an amount. Symbols match as-is.
// Currency words must not be followed by another letter, digit or underscore,
// so "5 gelato" or "3 larissa" is not read as an explicit GEL amount.
const CURRENCY_PATTERN =
  '(?:₾|\\$|(?:gel|lari|лари|usd|доллар(?:а|ов)?)(?![\\p{L}\\p{N}_]))'

// One amount (integer, or up to two decimals after "." or ",") followed by an
// optional currency token. Flags: global (all amounts are collected),
// case-insensitive, and unicode (needed for \p{...} and Cyrillic input).
const AMOUNT_WITH_OPTIONAL_CURRENCY = new RegExp(
  `(?<amount>\\d+(?:[.,]\\d{1,2})?)\\s*(?<currency>${CURRENCY_PATTERN})?`,
  'giu'
)
|
||||||
|
|
||||||
|
/**
 * Maps a raw currency token onto a supported currency code.
 *
 * The token is trimmed and lower-cased before comparison.
 *
 * @param raw - Currency text captured next to the amount, if any.
 * @returns `'GEL'` or `'USD'` for a recognised token, `null` when the token is
 *   missing, empty or unknown.
 */
function normalizeCurrency(raw: string | undefined): 'GEL' | 'USD' | null {
  if (!raw) {
    return null
  }

  const token = raw.trim().toLowerCase()

  const gelAliases = ['₾', 'gel', 'lari', 'лари']
  if (gelAliases.includes(token)) {
    return 'GEL'
  }

  // Any word starting with "доллар" counts as USD, whatever its ending.
  const isDollar = token === 'usd' || token === '$' || token.startsWith('доллар')
  return isDollar ? 'USD' : null
}
|
||||||
|
|
||||||
|
/**
 * Converts a decimal amount string into minor units (hundredths).
 *
 * The first comma is treated as a decimal point; the fractional part is
 * right-padded with zeros and cut to two digits.
 *
 * @param rawAmount - Amount text such as `"30"`, `"12.5"` or `"12,05"`.
 * @returns The amount in minor units, e.g. `1250n` for `"12.5"`.
 */
function toMinorUnits(rawAmount: string): bigint {
  const pieces = rawAmount.replace(',', '.').split('.')
  const wholePart = pieces[0]
  const fractionalPart = pieces[1] ?? ''

  // Appending "00" then keeping two characters pads short fractions.
  const cents = `${fractionalPart}00`.slice(0, 2)

  return BigInt(`${wholePart}${cents}`)
}
|
||||||
|
|
||||||
|
/**
 * Derives the item description by removing the matched amount fragment from
 * the message.
 *
 * Only the first occurrence of the fragment is removed; whitespace runs are
 * then collapsed to single spaces and the result is trimmed.
 *
 * @param rawText - Full message text.
 * @param matchedFragment - Exact text of the amount (and currency) match.
 * @returns The remaining text, or `'shared purchase'` when nothing is left.
 */
function normalizeDescription(rawText: string, matchedFragment: string): string {
  const withoutAmount = rawText.replace(matchedFragment, ' ')
  const description = withoutAmount.replace(/\s+/g, ' ').trim()

  return description === '' ? 'shared purchase' : description
}
|
||||||
|
|
||||||
|
/**
 * Deterministic parser: extracts a purchase when the text holds exactly one
 * amount.
 *
 * Returns `null` (so the caller may try the LLM fallback) when the text holds
 * no amount, more than one amount, or an amount of zero. A zero amount is
 * rejected here for the same reason the LLM result validation rejects it: a
 * purchase must have a positive amount.
 *
 * When no currency token follows the amount, GEL is assumed, confidence drops
 * from 92 to 78 and the result is flagged for review.
 *
 * @param rawText - Trimmed message text.
 * @returns The rules-based result, or `null` when rules cannot resolve it.
 */
function parseWithRules(rawText: string): ParsedPurchaseResult | null {
  const matches = Array.from(rawText.matchAll(AMOUNT_WITH_OPTIONAL_CURRENCY))

  // Zero matches means no amount; several means the text is ambiguous.
  if (matches.length !== 1) {
    return null
  }

  const [match] = matches
  if (!match?.groups?.amount) {
    return null
  }

  const amountMinor = toMinorUnits(match.groups.amount)
  if (amountMinor <= 0n) {
    return null
  }

  const currency = normalizeCurrency(match.groups.currency)
  const explicitCurrency = currency !== null

  return {
    amountMinor,
    currency: currency ?? 'GEL',
    itemDescription: normalizeDescription(rawText, match[0] ?? ''),
    confidence: explicitCurrency ? 92 : 78,
    parserMode: 'rules',
    needsReview: !explicitCurrency
  }
}
|
||||||
|
|
||||||
|
/**
 * Sanity-checks a result produced by the LLM fallback.
 *
 * @param result - Fallback output, or `null` when the fallback gave up.
 * @returns The same result object when it is acceptable; `null` when the
 *   result is missing, the amount is not positive, the confidence is not a
 *   finite number within 0-100, or the description is blank.
 */
function validateLlmResult(result: ParsedPurchaseResult | null): ParsedPurchaseResult | null {
  if (!result) {
    return null
  }

  if (result.amountMinor <= 0n) {
    return null
  }

  // NaN compares false against both bounds, so finiteness is checked first.
  const { confidence } = result
  if (!Number.isFinite(confidence) || confidence < 0 || confidence > 100) {
    return null
  }

  if (result.itemDescription.trim().length === 0) {
    return null
  }

  return result
}
|
||||||
|
|
||||||
|
/**
 * Parses a purchase message: rules first, LLM fallback second.
 *
 * The text is trimmed; blank text yields `null` without consulting either
 * parser. A rules result is returned as-is. Otherwise, if a fallback was
 * supplied, it is called with the trimmed text and its answer is returned only
 * when it passes validation. Errors thrown by the fallback are not caught.
 *
 * @param input - Message to parse.
 * @param options - Optional `llmFallback` used when rules give no result.
 * @returns The parsed purchase, or `null` when nothing could be extracted.
 */
export async function parsePurchaseMessage(
  input: ParsePurchaseInput,
  options: ParsePurchaseOptions = {}
): Promise<ParsedPurchaseResult | null> {
  const text = input.rawText.trim()
  if (text.length === 0) {
    return null
  }

  const fromRules = parseWithRules(text)
  if (fromRules) {
    return fromRules
  }

  if (!options.llmFallback) {
    return null
  }

  return validateLlmResult(await options.llmFallback(text))
}
|
||||||
7
packages/db/drizzle/0003_mature_roulette.sql
Normal file
7
packages/db/drizzle/0003_mature_roulette.sql
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
ALTER TABLE "purchase_messages" ADD COLUMN "parsed_amount_minor" bigint;--> statement-breakpoint
|
||||||
|
ALTER TABLE "purchase_messages" ADD COLUMN "parsed_currency" text;--> statement-breakpoint
|
||||||
|
ALTER TABLE "purchase_messages" ADD COLUMN "parsed_item_description" text;--> statement-breakpoint
|
||||||
|
ALTER TABLE "purchase_messages" ADD COLUMN "parser_mode" text;--> statement-breakpoint
|
||||||
|
ALTER TABLE "purchase_messages" ADD COLUMN "parser_confidence" integer;--> statement-breakpoint
|
||||||
|
ALTER TABLE "purchase_messages" ADD COLUMN "needs_review" integer DEFAULT 1 NOT NULL;--> statement-breakpoint
|
||||||
|
ALTER TABLE "purchase_messages" ADD COLUMN "parser_error" text;
|
||||||
1393
packages/db/drizzle/meta/0003_snapshot.json
Normal file
1393
packages/db/drizzle/meta/0003_snapshot.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -22,6 +22,13 @@
|
|||||||
"when": 1772670548136,
|
"when": 1772670548136,
|
||||||
"tag": "0002_tough_sandman",
|
"tag": "0002_tough_sandman",
|
||||||
"breakpoints": true
|
"breakpoints": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"idx": 3,
|
||||||
|
"version": "7",
|
||||||
|
"when": 1772671128084,
|
||||||
|
"tag": "0003_mature_roulette",
|
||||||
|
"breakpoints": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -198,6 +198,13 @@ export const purchaseMessages = pgTable(
|
|||||||
telegramThreadId: text('telegram_thread_id').notNull(),
|
telegramThreadId: text('telegram_thread_id').notNull(),
|
||||||
telegramUpdateId: text('telegram_update_id').notNull(),
|
telegramUpdateId: text('telegram_update_id').notNull(),
|
||||||
messageSentAt: timestamp('message_sent_at', { withTimezone: true }),
|
messageSentAt: timestamp('message_sent_at', { withTimezone: true }),
|
||||||
|
parsedAmountMinor: bigint('parsed_amount_minor', { mode: 'bigint' }),
|
||||||
|
parsedCurrency: text('parsed_currency'),
|
||||||
|
parsedItemDescription: text('parsed_item_description'),
|
||||||
|
parserMode: text('parser_mode'),
|
||||||
|
parserConfidence: integer('parser_confidence'),
|
||||||
|
needsReview: integer('needs_review').default(1).notNull(),
|
||||||
|
parserError: text('parser_error'),
|
||||||
processingStatus: text('processing_status').default('pending').notNull(),
|
processingStatus: text('processing_status').default('pending').notNull(),
|
||||||
ingestedAt: timestamp('ingested_at', { withTimezone: true }).defaultNow().notNull()
|
ingestedAt: timestamp('ingested_at', { withTimezone: true }).defaultNow().notNull()
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user