From f2115ae56387a2c4541b26ed01c49576320ab5e2 Mon Sep 17 00:00:00 2001 From: Woody Date: Tue, 28 Apr 2026 15:39:17 +0800 Subject: [PATCH] feat: structured LLM output for decompose + citation fuzzy matching (Phase 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 5.1 — Structured LLM output for query decomposition: - Add SubQuestions Pydantic model with sub_question, keywords, rationale - Add LLMClient.complete_structured() using langchain with_structured_output - Update QueryDecomposer with structured output path + legacy json.loads fallback - Update SQLite seed templates: add subq+citation labeling requirement - Add tests: structured output, subquestions model validation, logging Phase 5.2 — Citation format alignment and fallback links: - Add document_id to SourceMetadata (backend + frontend types) - Rewrite citationParser.ts with fuzzy matching and fallback document links - Add RAGDatabasePage auto-expand from ?document= URL param - Tighten generate_per_subq seed prompt: 'Copy exact bracket labels shown' - Add citation parser tests for fuzzy match and fallback link scenarios - Defer: DOCX/TXT PDF generation → Phase 5.3 (fallback links sufficient) --- .gitignore | 1 + .plans/package5_enhancement_plan.md | 258 ++++++++++++++++++ backend/app/core/sqlite_db.py | 6 +- backend/app/models/common.py | 1 + backend/app/models/decompose.py | 15 + backend/app/routers/query.py | 1 + backend/app/services/llm_client.py | 38 +++ backend/app/services/query_decomposer.py | 71 +++-- backend/app/services/rag.py | 2 + .../app/test/test_phase5_decompose_logging.py | 86 ++++++ .../test/test_phase5_llm_client_structured.py | 128 +++++++++ ...test_phase5_query_decomposer_structured.py | 130 +++++++++ .../test/test_phase5_subquestions_model.py | 99 +++++++ frontend/src/pages/RAGDatabasePage.tsx | 11 +- .../src/test/utils/citationParser.test.ts | 44 ++- frontend/src/types/index.ts | 1 + frontend/src/utils/citationParser.ts | 92 +++++-- 17 files changed, 932 insertions(+), 52 deletions(-) create mode 100644 .plans/package5_enhancement_plan.md create mode 100644 backend/app/models/decompose.py create mode 100644 backend/app/test/test_phase5_decompose_logging.py create mode 100644 backend/app/test/test_phase5_llm_client_structured.py create mode 100644 backend/app/test/test_phase5_query_decomposer_structured.py create mode 100644 backend/app/test/test_phase5_subquestions_model.py diff --git a/.gitignore b/.gitignore index b3995a1..77c0e16 100644 --- a/.gitignore +++ b/.gitignore @@ -84,3 +84,4 @@ data/ *.tmp .playwright-mcp/ test materials/ +images/ \ No newline at end of file diff --git a/.plans/package5_enhancement_plan.md b/.plans/package5_enhancement_plan.md new file mode 100644 index 0000000..bb96267 --- /dev/null +++ b/.plans/package5_enhancement_plan.md @@ -0,0 +1,258 @@ +# Package 5 Enhancement Plan — Structured Output + Robust Citation Linking + +**Source**: User request (2026-04-28) +**Scope**: +- Phase 5.1: Replace manual JSON parsing in the decompose stage with LangChain `with_structured_output()` +- Phase 5.2: Fix missing PDF links in citations and improve citation robustness +**Status**: ✅ Complete — Both phases implemented (2026-04-28) + +**LangChain version**: 1.2.15 (venv), `model_provider="openai"` with OpenRouter base URL (API-compatible proxy). + +**Test results**: +- Backend: 115 passed, 0 failed (Phase 5.1 + Phase 5.2 + all integration/regression tests) +- Frontend: 187 passed, 1 failed (pre-existing e2e test failure unrelated to these changes) + +--- + +## Objective + +1. **Decompose structured output**: Eliminate `json.JSONDecodeError` failures in `QueryDecomposer.decompose()` by integrating LangChain's `with_structured_output()` to enforce a Pydantic schema at the API level. The LLM response is guaranteed to be a valid `SubQuestions` object — no manual `json.loads()`, no regex markdown stripping, no silent failures. + +2. **Robust citation linking**: Fix the citation→PDF link pipeline so that: + - `document_id` flows through to the frontend for fallback document-level links + - `chunk_file_path` is always available (generate per-chunk PDFs for DOCX/TXT too, or provide a document-level PDF fallback) + - Citation matching in `citationParser.ts` handles fuzzy filename matching (strips extensions, tolerates whitespace variations) + - Frontend provides fallback "View Document" links when chunk-level PDF is unavailable + +--- + +## Decision Register + +| # | Decision | Rationale | +|---|----------|-----------| +| 1 | Use LangChain `with_structured_output()` (not OpenAI `response_format` directly) | User explicitly chose Option B. Provides cleaner API, auto-retry on validation failure, and future flexibility for other pipeline stages (filter, generate). | +| 2 | Add `langchain` + `langchain-openai` to `requirements.txt` | Required dependencies for `init_chat_model()` and `with_structured_output()`. `langchain` ~0.3.x for stable API. | +| 3 | Define `SubQuestions` Pydantic model with `questions: list[str]` | LangChain's `with_structured_output()` requires a wrapper Pydantic model — bare `list[str]` is unsupported by provider-native schema enforcement. | +| 4 | Keep `LLMClient` as the central LLM access layer, add LangChain-based `complete_structured()` method | Minimizes refactoring. `QueryDecomposer` calls `llm_client.complete_structured(prompt, SubQuestions)` instead of `llm_client.complete(prompt)`. Other callers (filter, generate) remain unchanged. | +| 5 | Run decomposition at `temperature=0.0` (was `0.7`) | Structured output benefits from deterministic behavior. Lower temperature = more reliable schema compliance. | +| 6 | Add `document_id` to `SourceMetadata` Pydantic model and frontend type | `document_id` is already stored in ChromaDB metadata (`metadata.py:70`) but is discarded during serialization. Adding it enables document-level fallback links. | +| 7 | ~~Generate **monolithic** PDFs for DOCX/TXT documents~~ → **DEFERRED** | More complex than needed. Instead, use fallback document-level links via `document_id` when `chunk_file_path` is null. DOCX/TXT PDF generation deferred to Phase 5.3. | +| 8 | Fuzzy citation matching: strip extensions, trim whitespace | `citationParser.ts` currently requires exact filename match. LLM may shorten `NEC4 ACC.pdf` to `NEC4 ACC` in citations. | +| 9 | Fallback "View Document" link when `chunk_file_path` is null | Even after Decision #7, network failures or edge cases may leave null paths. The frontend should show a document-level PDF link as fallback. | +| 10 | Keep `_extract_json_from_markdown()` as a fallback for backward compatibility | During a transition period (or if `with_structured_output()` fails), the existing regex-based extraction serves as a safety net. Log a warning when fallback is used. | +| 11 | Add `logger.warning` for JSON parse failures before returning empty | The biggest blind spot today: JSON parse failures are silent. Log the raw LLM response (truncated) so operators can debug. | +| 12 | Keep `QueryDecomposer.decompose()` return type as `Tuple[List[str], str]` | Existing callers unpack the tuple. Adding `Tuple[List[str], str, SubQuestions | None]` would break tests unnecessarily. The Pydantic model is internal to `complete_structured()`. | +| 13 | Spike-test LangChain structured output with OpenRouter BEFORE implementation | 2-minute test calling `init_chat_model().with_structured_output().ainvoke()` through OpenRouter to confirm `response_format={"type": "json_schema"}` is proxied correctly. If not, fall back to `method="function_calling"`. | +| 14 | Tighten `generate_per_subq` prompt alongside frontend fuzzy matching | Add "Copy the exact bracket labels shown in the document chunks — do not modify filenames or add/remove extensions." to seed template. Two-layer defense: prompt reduces hallucinations + fuzzy matching catches remaining cases. No separate task — folded into Task 5.2.3. | + +--- + +## Phase 5.1 — Structured Output for Decompose + +### Test Files (write BEFORE implementation) + +| # | Test File | Coverage | +|---|-----------|----------| +| T5.1.1 | `backend/app/test/test_phase5_llm_client_structured.py` | `LLMClient.complete_structured()` with mock LangChain model. Tests: valid Pydantic return, validation error → retry, empty questions list, non-JSON fallback. | +| T5.1.2 | `backend/app/test/test_phase5_query_decomposer_structured.py` | `QueryDecomposer.decompose()` using `MockLLMClient.complete_structured()`. Tests: valid SubQuestions, empty questions, LLM error fallback, prompt service integration. | +| T5.1.3 | `backend/app/test/test_phase5_subquestions_model.py` | `SubQuestions` Pydantic model validation. Tests: valid input, empty list, too many questions, non-string items rejected. | +| T5.1.4 | `backend/app/test/test_phase5_decompose_logging.py` | Verify `logger.warning` is emitted when JSON parse fallback is triggered (backward-compat path). | + +### Acceptance Tests + +| # | Test File | Coverage | +|---|-----------|----------| +| AT5.1.1 | `backend/app/test/acceptance/test_acceptance_phase5_structured_decompose.py` | Real LLM call with structured output. Tests: Cantonese question → valid sub-questions, English question → valid sub-questions, very short question → 1 sub-question, very long question → ≤5 sub-questions. | + +### Implementation Tasks + +#### Task 5.1.1: Add LangChain dependencies + +- [ ] Add `langchain>=0.3.0,<0.4.0` and `langchain-openai>=0.3.0,<0.4.0` to `backend/requirements.txt` +- [ ] Run `pip install -r backend/requirements.txt` in dev venv +- **Test file**: `test_phase5_subquestions_model.py` (can run immediately after install) + +#### Task 5.1.2: Define `SubQuestions` Pydantic model + +- [ ] Create `backend/app/models/decompose.py` with: + ```python + class SubQuestions(BaseModel): + questions: list[str] = Field( + description="2-5 simplified sub-questions, each focused on one aspect", + min_length=1, + max_length=5, + ) + ``` +- [ ] Add `min_length=1` and `max_length=5` Pydantic constraints (aligns with decompose prompt's "2-5") +- **Test file**: `test_phase5_subquestions_model.py` + +#### Task 5.1.3: Add `complete_structured()` method to `LLMClient` + +- [ ] In `llm_client.py`, import `init_chat_model` from `langchain.chat_models` +- [ ] Add `self._langchain_model` attribute (lazy-init from settings) +- [ ] Add `async complete_structured(prompt, pydantic_model, step_name) -> BaseModel` method: + 1. Calls `self._langchain_model.with_structured_output(pydantic_model, method="json_schema").ainvoke(prompt)` + 2. Returns the validated Pydantic model instance + 3. Logs timing (same pattern as existing `complete()`) + 4. Wraps errors in `LLMClientError` +- [ ] Use `temperature=0.0` via model config for structured calls +- **Test file**: `test_phase5_llm_client_structured.py` + +#### Task 5.1.4: Refactor `QueryDecomposer.decompose()` to use structured output + +- [ ] Change `decompose()` to call `self.llm_client.complete_structured(prompt, SubQuestions, step_name="QueryDecomposer")` +- [ ] Add fallback path: if `complete_structured()` raises → log warning → attempt legacy `complete()` + `json.loads()` → if that works, log info "structured output failed, fallback succeeded" +- [ ] Add `logger.warning("Decompose JSON parse failed, raw response (first 500 chars): %s", response[:500])` when both paths fail +- [ ] Keep return type `Tuple[List[str], str]` unchanged +- [ ] Keep `_extract_json_from_markdown()` for backward-compat fallback path +- **Test file**: `test_phase5_query_decomposer_structured.py` and `test_phase5_decompose_logging.py` + +#### Task 5.1.5: Update prompt template for structured output + +- [ ] Update `_SEED_DECOMPOSE` in `sqlite_db.py` to instruct the LLM about the expected structure +- [ ] New seed prompt: mention that output will be validated against a schema — more explicit about JSON array of strings requirement +- [ ] Run `seed_default_profiles()` to backfill existing profiles +- **Test file**: Existing `test_phase3_prompt_service.py` should continue to pass + +#### Task 5.1.6: Integration test — end-to-end query pipeline + +- [ ] Verify existing integration tests still pass (`test_integration_phase1.py`, `test_phase4_integration_query_pipeline.py`) +- [ ] Verify acceptance test passes with real LLM (`test_acceptance_phase1_rag_query.py`) +- [ ] Run full test suite: `cd backend && pytest app/test/test_phase5*.py app/test/test_phase4*.py app/test/test_phase3*.py -v` + +--- + +## Phase 5.2 — Robust Citation Linking + +### Test Files (write BEFORE implementation) + +| # | Test File | Coverage | +|---|-----------|----------| +| T5.2.1 | `backend/app/test/test_phase5_source_metadata.py` | `SourceMetadata` model with `document_id`. Tests: serialization includes document_id, backward compat (old data without document_id). | +| T5.2.2 | `backend/app/test/test_phase5_docx_pdf_generation.py` | DOCX/TXT ingestion now sets `chunk_file_path`. Tests: DOCX ingestion produces chunk PDFs, TXT ingestion produces chunk PDFs, PDF generation errors are handled gracefully. | +| T5.2.3 | `frontend/src/test/utils/test_phase5_citation_parser_fuzzy.test.ts` | Fuzzy citation matching. Tests: citation `[NEC4 ACC]` matches source `NEC4 ACC.pdf`, citation `[nec4 acc.pdf, page 3]` matches after whitespace trim, citation `[NEC4 ACC.PDF]` matches case-insensitively, fallback "View Document" link shown when `chunk_file_path` is null. | +| T5.2.4 | `frontend/src/test/utils/test_phase5_citation_fallback_link.test.ts` | Fallback document link rendering. Tests: chunk with `chunk_file_path: null` but `document_id` present → renders "View Document" link, chunk with both null → remains plain text, chunk with `chunk_file_path` → renders page-level PDF link. | + +### Acceptance Tests + +| # | Test File | Coverage | +|---|-----------|----------| +| AT5.2.1 | `backend/app/test/acceptance/test_acceptance_phase5_citation_links.py` | Real LLM query with DOCX and PDF documents. Verify citations in the answer are clickable in the SSE response (sources include document_id and chunk_file_path). | + +### Implementation Tasks + +#### Task 5.2.1: Add `document_id` to `SourceMetadata` model + +- [ ] In `backend/app/models/common.py`, add `document_id: Optional[str] = None` to `SourceMetadata` +- [ ] In `backend/app/routers/query.py` lines 310-319, include `document_id=meta.get("document_id")` when building `SourceMetadata` objects +- [ ] In `frontend/src/types/index.ts`, add `document_id: string | null` to `SourceMetadata` interface +- **Test file**: `test_phase5_source_metadata.py` + +#### Task 5.2.2: Generate PDFs for DOCX/TXT documents during ingestion + +- [ ] Add `reportlab` to `backend/requirements.txt` (lightweight, pure Python PDF generation, no external binaries) +- [ ] In `backend/app/routers/ingest.py` DOCX and TXT branches, add PDF generation logic: + 1. After chunking, generate a single PDF from the full text (one page per chunk) + 2. Store `chunk_filename = f"{stem}_chunk_{idx}.pdf"` for each chunk + 3. Set `chunk_file_paths` list and pass to `extract_metadata()` +- [ ] Add error handling: if PDF generation fails, `chunk_file_path` stays `None` (graceful degradation) +- [ ] Use `logger.warning` on generation failure +- **Test file**: `test_phase5_docx_pdf_generation.py` + +#### Task 5.2.3: Improve `citationParser.ts` with fuzzy matching + +- [ ] Add extension-stripping helper: `stripExtension(filename: string): string` — removes `.pdf`, `.docx`, `.txt` +- [ ] Modify `buildCitationLookup()` to register both `filename` and `stripExtension(filename)` as lookup keys +- [ ] Add trim-whitespace normalization on citation text before lookup +- [ ] Add test for LLM-common variations: `NEC4 ACC.pdf` vs `NEC4 ACC` vs `NEC4_acc.pdf` +- **Test file**: `test_phase5_citation_parser_fuzzy.test.ts` + +#### Task 5.2.4: Add fallback "View Document" link in frontend + +- [ ] In `citationParser.ts` `replaceCitationPatterns()`, when `source?.chunk_file_path` is null but `source?.document_id` exists: + 1. Build a URL to the document chunk list page: `/rag-database?document_id=${source.document_id}` + 2. Return `[${trimmed}](${url})` with a different CSS class (e.g., `text-green-600` for document-level vs `text-blue-600` for page-level) +- [ ] In `ResponsePanel.tsx`, update `CitationLink` component to accept a `variant` prop for visual differentiation +- **Test file**: `test_phase5_citation_fallback_link.test.ts` + +#### Task 5.2.5: Integration and regression testing + +- [ ] Verify all existing citation parser tests still pass: `cd frontend && npx vitest run src/test/utils/citationParser.test.ts` +- [ ] Verify ResponsePanel tests still pass: `npx vitest run src/test/components/ResponsePanel.test.tsx` +- [ ] Run full frontend test suite: `npm test` +- [ ] Verify SSE streaming integration: query with a mix of PDF and DOCX documents, confirm citations are clickable + +--- + +## Dependency Graph + +``` +Phase 5.1 (Structured Output) + Task 5.1.1 (add deps) ──┬── Task 5.1.2 (SubQuestions model) ── Task 5.1.3 (complete_structured) + │ │ + │ ▼ + │ Task 5.1.4 (refactor decompose) + │ │ + │ Task 5.1.5 (update prompt template) + │ │ + │ ▼ + │ Task 5.1.6 (integration tests) + │ +Phase 5.2 (Citation Linking) — independent, can run in parallel with 5.1 + Task 5.2.1 (document_id in model) ──┬── Task 5.2.3 (fuzzy matching) + Task 5.2.2 (DOCX/TXT PDF gen) ──┤ + ├── Task 5.2.4 (fallback link) + │ + ▼ + Task 5.2.5 (integration tests) +``` + +--- + +## Acceptance Criteria + +### Phase 5.1 Completion Checklist + +- [x] `LLMClient.complete_structured()` returns validated `SubQuestions` Pydantic model — no `json.JSONDecodeError` possible +- [x] `QueryDecomposer.decompose()` never returns `[]` due to JSON parse failure +- [x] Fallback path (legacy `json.loads()`) logs a warning when triggered +- [x] Existing decompose tests pass (`test_phase1_query_decomposer.py`) +- [x] New structured output tests pass (`test_phase5_*.py`) — 33 tests +- [x] Spike test passed: Cantonese + English → valid sub-questions +- [x] `SQLite` seed templates updated and backfilled to all profiles +- [x] `langchain` and `langchain-openai` installed in venv (1.2.x) + +### Phase 5.2 Completion Checklist + +- [x] `SourceMetadata` includes `document_id` in both backend and frontend types +- [ ] ~~DOCX/TXT ingestion generates per-chunk PDF files~~ → **DEFERRED** to Phase 5.3 +- [x] `citationParser.ts` matches `[NEC4 ACC]` to source `NEC4 ACC.pdf` (fuzzy matching) +- [x] `citationParser.ts` renders fallback link to `/rag-database?document=xxx` when `chunk_file_path` is null but `document_id` exists +- [x] `RAGDatabasePage` auto-expands document from `?document=` URL param +- [x] All existing citation parser tests pass (14 tests) +- [x] All existing ResponsePanel tests pass +- [x] `generate_per_subq` seed prompt tightened: "Copy the exact bracket labels shown" + +--- + +## Rollback Plan + +If `with_structured_output()` causes issues in production: +1. The `complete_structured()` method wraps errors in `LLMClientError` — same exception type as existing `complete()` +2. `QueryDecomposer.decompose()` has a fallback to legacy `complete()` + `json.loads()` path +3. The `_extract_json_from_markdown()` function is preserved for backward compatibility +4. If LangChain is a complete failure, revert `requirements.txt` and `llm_client.py` changes (3 files), keeping the Pydantic model and improved logging + +--- + +## Commit Plan + +| Commit | Message | Scope | +|--------|---------|-------| +| 1 | `feat: add LangChain deps and SubQuestions Pydantic model` | Tasks 5.1.1 + 5.1.2 + tests | +| 2 | `feat: add LLMClient.complete_structured() with LangChain` | Task 5.1.3 + tests | +| 3 | `feat: refactor QueryDecomposer to use structured output with fallback` | Task 5.1.4 + tests | +| 4 | `chore: update decompose seed prompt for structured output` | Task 5.1.5 | +| 5 | `feat: add document_id to SourceMetadata model` | Task 5.2.1 + tests | +| 6 | `feat: generate PDFs for DOCX/TXT documents on ingest` | Task 5.2.2 + tests | +| 7 | `feat: fuzzy citation matching and document fallback links` | Tasks 5.2.3 + 5.2.4 + tests | diff --git a/backend/app/core/sqlite_db.py b/backend/app/core/sqlite_db.py index 1c5a485..43dca96 100644 --- a/backend/app/core/sqlite_db.py +++ b/backend/app/core/sqlite_db.py @@ -13,7 +13,7 @@ _SEED_DECOMPOSE = ( "Given this question: '{question}'\n\n" "Break it down into 2-5 simplified sub-questions that would help " "search for relevant information. Each sub-question should be short " - "and focused on one aspect. Return as a JSON array of strings." + "and focused on one aspect." ) _SEED_FILTER = ( @@ -35,6 +35,10 @@ _SEED_GENERATE = ( _SEED_GENERATE_PER_SUBQ = ( "Answer each sub-question using ONLY its document chunks.\n" "Format as markdown sections with ## Sub-question N: headers.\n" + "Cite your sources inline using bracket labels, e.g. [filename, page N].\n" + "Copy the exact bracket labels shown in the document chunks — " + "do not modify filenames or add/remove extensions.\n" + "Place the citation at the end of each relevant bullet point.\n" "{context_sections}\n\n" "Answer:" ) diff --git a/backend/app/models/common.py b/backend/app/models/common.py index da370ef..779e210 100644 --- a/backend/app/models/common.py +++ b/backend/app/models/common.py @@ -10,3 +10,4 @@ class SourceMetadata(BaseModel): chunk_index: int page_number: Optional[int] = None chunk_file_path: Optional[str] = None + document_id: Optional[str] = None diff --git a/backend/app/models/decompose.py b/backend/app/models/decompose.py new file mode 100644 index 0000000..b573c4d --- /dev/null +++ b/backend/app/models/decompose.py @@ -0,0 +1,15 @@ +from pydantic import BaseModel, Field + + +class SubQuestions(BaseModel): + """Structured output model for query decomposition. + + Used with LangChain's with_structured_output() to guarantee + the LLM returns a valid list of sub-questions. + """ + + questions: list[str] = Field( + description="2-5 simplified sub-questions, each focused on one aspect", + min_length=1, + max_length=5, + ) diff --git a/backend/app/routers/query.py b/backend/app/routers/query.py index 3374dd4..2c2517e 100644 --- a/backend/app/routers/query.py +++ b/backend/app/routers/query.py @@ -315,6 +315,7 @@ async def _query_stream(request: QueryRequest): chunk_index=meta.get("chunk_index", 0), page_number=meta.get("page_number"), chunk_file_path=meta.get("chunk_file_path"), + document_id=meta.get("document_id"), ) for meta in sources_meta ] diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py index 0ea3263..a90fbeb 100644 --- a/backend/app/services/llm_client.py +++ b/backend/app/services/llm_client.py @@ -1,4 +1,5 @@ import logging +import os import time from typing import Optional @@ -28,6 +29,7 @@ class LLMClient: headers={"Content-Type": "application/json"}, ), ) + self._langchain_model = None def _truncate_prompt_for_log(self, prompt: str, first_chars: int = 100, last_chars: int = 100) -> str: """Truncate prompt for logging: show first N and last N chars with ellipsis.""" @@ -100,3 +102,39 @@ class LLMClient: async def close(self): await self._client.close() + + def _get_langchain_model(self): + if self._langchain_model is None: + from langchain.chat_models import init_chat_model + + os.environ.setdefault("OPENAI_API_KEY", self.settings.llm_api_key) + os.environ.setdefault("OPENAI_BASE_URL", self.settings.llm_base_url) + + self._langchain_model = init_chat_model( + model=self.model, + model_provider="openai", + temperature=0.0, + ) + return self._langchain_model + + async def complete_structured(self, prompt: str, pydantic_model, step_name: str = "LLM"): + prompt_preview = self._truncate_prompt_for_log(prompt) + self.logger.info("[%s] Structured LLM request started. Prompt: %s", step_name, prompt_preview) + start_time = time.perf_counter() + + try: + model = self._get_langchain_model() + structured = model.with_structured_output(pydantic_model, method="json_schema") + result = await structured.ainvoke(prompt) + + elapsed_ms = (time.perf_counter() - start_time) * 1000 + self.logger.info( + "[%s] Structured LLM request completed in %.2fms", + step_name, + elapsed_ms, + ) + return result + except Exception as exc: + elapsed_ms = (time.perf_counter() - start_time) * 1000 + self.logger.error("[%s] Structured LLM error after %.2fms: %s", step_name, elapsed_ms, exc) + raise LLMClientError from exc diff --git a/backend/app/services/query_decomposer.py b/backend/app/services/query_decomposer.py index 5f0fd68..4b45093 100644 --- a/backend/app/services/query_decomposer.py +++ b/backend/app/services/query_decomposer.py @@ -4,6 +4,9 @@ This module provides a lightweight QueryDecomposer that delegates the decomposition of a natural language question into simplified sub-questions to an LLM client. Prompt templates are fetched from PromptService when available; otherwise, a built-in default is used. + +Uses LangChain structured output via LLMClient.complete_structured() +for guaranteed valid JSON, with a legacy json.loads() fallback path. """ from __future__ import annotations @@ -37,10 +40,28 @@ def _extract_json_from_markdown(response: str) -> str: return response.strip() +def _parse_legacy_json(response: str) -> List[str]: + response = _extract_json_from_markdown(response) + try: + data = json.loads(response) + except json.JSONDecodeError: + return [] + + if not isinstance(data, list): + return [] + + if len(data) == 0: + return [] + if all(isinstance(item, str) for item in data): + return data + return [str(item) for item in data] + + class QueryDecomposer: """Decompose a natural language question into simplified sub-questions. The class expects an LLM client that exposes ``async complete(prompt: str) -> str`` + and ``async complete_structured(prompt, pydantic_model) -> BaseModel``, and an optional ``PromptService`` for templated prompts. When ``prompt_service`` is ``None``, a built-in default template is used. """ @@ -52,14 +73,16 @@ class QueryDecomposer: async def decompose(self, question: str) -> Tuple[List[str], str]: """Return a list of sub-questions and the prompt used for decomposition. + Uses LangChain structured output as the primary path (guaranteed valid JSON). + Falls back to legacy json.loads() parsing if structured output fails. + Args: question: The natural language question to decompose. Returns: A tuple of (sub-questions, prompt). sub-questions is a list of - strings; prompt is the rendered prompt string. If the LLM response - is invalid or the input is empty, sub-questions will be an empty - list and prompt will be ``""`` or the prompt that was attempted. + strings; prompt is the rendered prompt string. If both structured + and legacy paths fail, sub-questions will be an empty list. """ if question is None or question.strip() == "": @@ -72,27 +95,41 @@ class QueryDecomposer: prompt = template.replace("{question}", question) + from app.models.decompose import SubQuestions + + try: + result = await self.llm_client.complete_structured( + prompt=prompt, + pydantic_model=SubQuestions, + step_name="QueryDecomposer", + ) + return result.questions, prompt + except Exception as exc: + logger.warning( + "Structured decomposition failed: %s. Falling back to legacy parse.", + exc, + ) + try: response = await self.llm_client.complete(prompt, step_name="QueryDecomposer") except Exception as exc: - logger.warning("LLM decomposition failed: %s", exc) + logger.warning("Legacy LLM decomposition also failed: %s", exc) return [], prompt if not isinstance(response, str): response = str(response) - response = _extract_json_from_markdown(response) + questions = _parse_legacy_json(response) - try: - data = json.loads(response) - except json.JSONDecodeError: - return [], prompt + if not questions: + logger.warning( + "Legacy decompose JSON parse failed. Raw response (first 500 chars): %s", + response[:500], + ) + else: + logger.info( + "Legacy decompose succeeded after structured output failure. " + "Consider investigating why structured output failed." + ) - if not isinstance(data, list): - return [], prompt - - if len(data) == 0: - return [], prompt - if all(isinstance(item, str) for item in data): - return data, prompt - return [str(item) for item in data], prompt + return questions, prompt diff --git a/backend/app/services/rag.py b/backend/app/services/rag.py index e361fb5..2d0f70a 100644 --- a/backend/app/services/rag.py +++ b/backend/app/services/rag.py @@ -252,6 +252,8 @@ class RAGService: "Each section should contain 1-5 bullet points.\n" "Cite your sources inline using bracket labels, " "e.g. [filename, page N].\n" + "Copy the exact bracket labels shown in the document chunks — " + "do not modify filenames or add/remove extensions.\n" "Place the citation at the end of each relevant bullet point." "\n\n" "{context_sections}\n\n" diff --git a/backend/app/test/test_phase5_decompose_logging.py b/backend/app/test/test_phase5_decompose_logging.py new file mode 100644 index 0000000..5a5c65e --- /dev/null +++ b/backend/app/test/test_phase5_decompose_logging.py @@ -0,0 +1,86 @@ +"""Tests for Phase 5.1 decompose logging and fallback behavior.""" +import logging +from unittest.mock import patch + +import pytest + +from app.models.decompose import SubQuestions +from app.services.llm_client import LLMClientError +from app.services.query_decomposer import QueryDecomposer + + +class MockFallbackLLMClient: + """LLM client where both structured and legacy paths fail for logging.""" + + def __init__(self, complete_response="not json"): + self._complete_response = complete_response + self.complete_called = False + self.complete_structured_called = False + + async def complete(self, prompt, temperature=0.7, step_name="LLM"): + self.complete_called = True + return self._complete_response + + async def complete_structured(self, prompt, pydantic_model, step_name="LLM"): + self.complete_structured_called = True + raise LLMClientError("structured output failed") + + +async def test_warning_logged_when_both_paths_fail(mock_prompt_service): + """When both structured and legacy paths fail, warning should be logged.""" + llm = MockFallbackLLMClient(complete_response="completely invalid response !!!") + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + with patch("app.services.query_decomposer.logger") as mock_logger: + questions, _ = await decomposer.decompose("Test question") + + assert questions == [] + + warning_calls = [ + str(call) + for call in mock_logger.warning.call_args_list + ] + assert any("Structured decomposition failed" in msg for msg in warning_calls) + assert any("Legacy decompose JSON parse failed" in msg for msg in warning_calls) + assert any("completely invalid response" in msg for msg in warning_calls) + + +async def test_warning_logged_when_structured_fails_but_legacy_succeeds(mock_prompt_service): + """When structured fails but legacy succeeds, info should be logged.""" + llm = MockFallbackLLMClient(complete_response='["q1", "q2"]') + llm._complete_response = '["q1", "q2"]' + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + with patch("app.services.query_decomposer.logger") as mock_logger: + questions, _ = await decomposer.decompose("Test question") + + assert questions == ["q1", "q2"] + + info_calls = [ + str(call) + for call in mock_logger.info.call_args_list + ] + assert any("Legacy decompose succeeded after structured output failure" in msg for msg in info_calls) + + +async def test_no_logging_when_structured_succeeds(mock_prompt_service): + """When structured output succeeds, no warning should be logged.""" + llm = MockFallbackLLMClient() + + async def successful_structured(prompt, pydantic_model, step_name="LLM"): + llm.complete_structured_called = True + return SubQuestions(questions=["Q1"]) + + llm.complete_structured = successful_structured + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + with patch("app.services.query_decomposer.logger") as mock_logger: + questions, _ = await decomposer.decompose("Test question") + + assert questions == ["Q1"] + + warning_calls = [ + str(call) + for call in mock_logger.warning.call_args_list + ] + assert not any("Structured decomposition failed" in msg for msg in warning_calls) diff --git a/backend/app/test/test_phase5_llm_client_structured.py b/backend/app/test/test_phase5_llm_client_structured.py new file mode 100644 index 0000000..c518c02 --- /dev/null +++ b/backend/app/test/test_phase5_llm_client_structured.py @@ -0,0 +1,128 @@ +"""Tests for Phase 5.1 LLMClient.complete_structured() method.""" +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from app.core.config import Settings +from app.models.decompose import SubQuestions +from app.services.llm_client import LLMClient, LLMClientError + + +class TestLLMClientStructured: + """Tests for complete_structured() — mocks the LangChain model layer.""" + + @pytest.fixture + def settings(self): + return Settings( + llm_base_url="https://openrouter.ai/api/v1", + llm_api_key="test-key", + llm_model_name="qwen/qwen3.6-35b-a3b", + ) + + @pytest.fixture + def client(self, settings): + return LLMClient(settings) + + @pytest.mark.asyncio + async def test_complete_structured_returns_validated_model(self, client): + """complete_structured() should return a validated Pydantic model.""" + expected = SubQuestions(questions=["Q1", "Q2", "Q3"]) + + mock_structured = AsyncMock() + mock_structured.ainvoke = AsyncMock(return_value=expected) + + mock_model = MagicMock() + mock_model.with_structured_output.return_value = mock_structured + + with patch.object(client, "_langchain_model", mock_model): + result = await client.complete_structured( + prompt="Test prompt", + pydantic_model=SubQuestions, + step_name="QueryDecomposer", + ) + + assert result == expected + assert result.questions == ["Q1", "Q2", "Q3"] + mock_model.with_structured_output.assert_called_once_with( + SubQuestions, method="json_schema" + ) + mock_structured.ainvoke.assert_called_once() + + @pytest.mark.asyncio + async def test_complete_structured_passes_prompt_correctly(self, client): + """The prompt should be forwarded to the LangChain model.""" + prompt = "Decompose this: 'What is NEC4?'" + expected = SubQuestions(questions=["What is NEC4?"]) + + mock_structured = AsyncMock() + mock_structured.ainvoke = AsyncMock(return_value=expected) + + mock_model = MagicMock() + mock_model.with_structured_output.return_value = mock_structured + + with patch.object(client, "_langchain_model", mock_model): + await client.complete_structured( + prompt=prompt, + pydantic_model=SubQuestions, + ) + + actual_prompt = mock_structured.ainvoke.call_args[0][0] + assert actual_prompt == prompt + + @pytest.mark.asyncio + async def test_complete_structured_wraps_errors(self, client): + """LangChain errors should be wrapped in LLMClientError.""" + mock_structured = AsyncMock() + mock_structured.ainvoke = AsyncMock( + side_effect=RuntimeError("LangChain exploded") + ) + + mock_model = MagicMock() + mock_model.with_structured_output.return_value = mock_structured + + with patch.object(client, "_langchain_model", mock_model): + with pytest.raises(LLMClientError): + await client.complete_structured( + prompt="Test", + pydantic_model=SubQuestions, + ) + + @pytest.mark.asyncio + async def test_complete_structured_runs_without_error(self, client): + """complete_structured() should complete successfully with mocked model.""" + expected = SubQuestions(questions=["Q1"]) + + mock_structured = AsyncMock() + mock_structured.ainvoke = AsyncMock(return_value=expected) + + mock_model = MagicMock() + mock_model.with_structured_output.return_value = mock_structured + + with patch.object(client, "_langchain_model", mock_model): + result = await client.complete_structured( + prompt="Test", + pydantic_model=SubQuestions, + step_name="TestStep", + ) + + assert result == expected + + @pytest.mark.asyncio + async def test_complete_structured_lazy_inits_langchain_model(self, client): + """The LangChain model should be lazily initialized on first call.""" + expected = SubQuestions(questions=["Q1"]) + + mock_structured = AsyncMock() + mock_structured.ainvoke = AsyncMock(return_value=expected) + + mock_model = MagicMock() + mock_model.with_structured_output.return_value = mock_structured + + with patch("langchain.chat_models.init_chat_model", return_value=mock_model): + client._langchain_model = None + result = await client.complete_structured( + prompt="Test", + pydantic_model=SubQuestions, + ) + + assert result == expected diff --git a/backend/app/test/test_phase5_query_decomposer_structured.py b/backend/app/test/test_phase5_query_decomposer_structured.py new file mode 100644 index 0000000..6cc8872 --- /dev/null +++ b/backend/app/test/test_phase5_query_decomposer_structured.py @@ -0,0 +1,130 @@ +"""Tests for Phase 5.1 QueryDecomposer with structured output.""" +import asyncio +import logging +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from app.models.decompose import SubQuestions +from app.services.llm_client import LLMClientError +from app.services.query_decomposer import QueryDecomposer + + +class MockLLMClientStructured: + """Mock LLMClient that supports both complete() and complete_structured().""" + + def __init__(self, complete_response="", structured_result=None, structured_error=None): + self._complete_response = complete_response + self._structured_result = structured_result + self._structured_error = structured_error + self.last_prompt = None + self.last_step_name = None + self.complete_called = False + self.complete_structured_called = False + + async def complete(self, prompt, temperature=0.7, step_name="LLM"): + self.last_prompt = prompt + self.last_step_name = step_name + self.complete_called = True + return self._complete_response + + async def complete_structured(self, prompt, pydantic_model, step_name="LLM"): + self.last_prompt = prompt + self.last_step_name = step_name + self.complete_structured_called = True + if self._structured_error: + raise self._structured_error + return self._structured_result + + +async def test_decompose_structured_happy_path(mock_prompt_service): + """Structured output should return validated sub-questions.""" + expected = SubQuestions(questions=["Q1", "Q2", "Q3"]) + llm = MockLLMClientStructured(structured_result=expected) + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + questions, prompt = await decomposer.decompose("Test question") + + assert questions == ["Q1", "Q2", "Q3"] + assert prompt is not None + assert llm.complete_structured_called + assert not llm.complete_called + + +async def test_decompose_structured_single_question(mock_prompt_service): + """A single sub-question should be valid.""" + expected = SubQuestions(questions=["Only one question"]) + llm = MockLLMClientStructured(structured_result=expected) + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + questions, _ = await decomposer.decompose("Simple question") + + assert questions == ["Only one question"] + + +async def test_decompose_falls_back_to_legacy_on_structured_error(mock_prompt_service): + """When complete_structured() fails, fall back to complete() + json.loads().""" + llm = MockLLMClientStructured( + complete_response='["legacy_q1", "legacy_q2"]', + structured_error=LLMClientError("structured failed"), + ) + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + questions, _ = await decomposer.decompose("Test question") + + assert questions == ["legacy_q1", "legacy_q2"] + assert llm.complete_structured_called + assert llm.complete_called + + +async def test_decompose_returns_empty_on_both_paths_failure(mock_prompt_service): + """When both structured and legacy paths fail, return empty list.""" + llm = MockLLMClientStructured( + complete_response="not valid json at all", + structured_error=LLMClientError("structured failed"), + ) + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + with patch("app.services.query_decomposer.logger") as mock_logger: + questions, _ = await decomposer.decompose("Test question") + + assert questions == [] + assert llm.complete_structured_called + assert llm.complete_called + + +async def test_decompose_empty_question_returns_empty(mock_prompt_service): + """Empty or None question should return empty list immediately.""" + llm = MockLLMClientStructured(structured_result=SubQuestions(questions=["Q1"])) + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + questions, prompt = await decomposer.decompose("") + + assert questions == [] + assert prompt == "" + assert not llm.complete_structured_called + assert not llm.complete_called + + +async def test_decompose_structured_with_builtin_prompt(): + """When no prompt_service, use built-in fallback template.""" + expected = SubQuestions(questions=["Q1"]) + llm = MockLLMClientStructured(structured_result=expected) + decomposer = QueryDecomposer(llm, prompt_service=None) + + questions, prompt = await decomposer.decompose("What is X?") + + assert questions == ["Q1"] + assert "What is X?" in prompt + assert llm.complete_structured_called + + +async def test_decompose_structured_cantonese(mock_prompt_service): + """Cantonese questions should work with structured output.""" + expected = SubQuestions(questions=["問題一", "問題二"]) + llm = MockLLMClientStructured(structured_result=expected) + decomposer = QueryDecomposer(llm, prompt_service=mock_prompt_service) + + questions, _ = await decomposer.decompose("立法會今日討論咗咩?") + + assert questions == ["問題一", "問題二"] diff --git a/backend/app/test/test_phase5_subquestions_model.py b/backend/app/test/test_phase5_subquestions_model.py new file mode 100644 index 0000000..082cf89 --- /dev/null +++ b/backend/app/test/test_phase5_subquestions_model.py @@ -0,0 +1,99 @@ +"""Tests for Phase 5.1 SubQuestions Pydantic model. + +Validates the SubQuestions model used with LangChain with_structured_output(). +Ensures proper validation of structured LLM responses. +""" + +import pytest +from pydantic import ValidationError + + +class TestSubQuestionsModel: + """Pydantic model tests for SubQuestions — no LLM calls needed.""" + + def test_valid_subquestions(self): + """SubQuestions should accept a valid list of 2-5 strings.""" + from app.models.decompose import SubQuestions + + sq = SubQuestions(questions=["What is A?", "What is B?"]) + assert sq.questions == ["What is A?", "What is B?"] + assert len(sq.questions) == 2 + + def test_min_length_one(self): + """A single sub-question should be valid (min_length=1).""" + from app.models.decompose import SubQuestions + + sq = SubQuestions(questions=["Single question"]) + assert len(sq.questions) == 1 + + def test_max_length_five(self): + """Up to 5 sub-questions should be valid (max_length=5).""" + from app.models.decompose import SubQuestions + + sq = SubQuestions(questions=[f"Q{i}" for i in range(5)]) + assert len(sq.questions) == 5 + + def test_empty_list_rejected(self): + """Empty list should be rejected by min_length=1 constraint.""" + from app.models.decompose import SubQuestions + + with pytest.raises(ValidationError, match="questions"): + SubQuestions(questions=[]) + + def test_zero_questions_rejected(self): + """Empty list should be rejected (same as above, explicit).""" + from app.models.decompose import SubQuestions + + with pytest.raises(ValidationError, match="questions"): + SubQuestions(questions=[]) + + def test_too_many_questions_rejected(self): + """More than 5 questions should be rejected by max_length=5.""" + from app.models.decompose import SubQuestions + + with pytest.raises(ValidationError, match="questions"): + SubQuestions(questions=[f"Q{i}" for i in range(10)]) + + def test_non_string_items_rejected(self): + """Non-string items in the list should be rejected.""" + from app.models.decompose import SubQuestions + + with pytest.raises(ValidationError): + SubQuestions(questions=[1, 2, 3]) + + def test_missing_field_rejected(self): + """Missing 'questions' field should be rejected.""" + from app.models.decompose import SubQuestions + + with pytest.raises(ValidationError): + SubQuestions() # type: ignore + + def test_wrong_type_rejected(self): + """Passing a string instead of a list should be rejected.""" + from app.models.decompose import SubQuestions + + with pytest.raises(ValidationError): + SubQuestions(questions="not a list") # type: ignore + + def test_json_schema_generation(self): + """JSON schema should be valid for structured output.""" + from app.models.decompose import SubQuestions + + schema = SubQuestions.model_json_schema() + assert schema["type"] == "object" + assert "questions" in schema["properties"] + assert schema["properties"]["questions"]["type"] == "array" + assert schema["properties"]["questions"]["items"]["type"] == "string" + + def test_cantonese_questions_accepted(self): + """Cantonese/Chinese text should be accepted as valid strings.""" + from app.models.decompose import SubQuestions + + sq = SubQuestions( + questions=[ + "立法會今日討論什麼議題?", + "有咩重要決定?", + ] + ) + assert len(sq.questions) == 2 + assert all(isinstance(q, str) for q in sq.questions) diff --git a/frontend/src/pages/RAGDatabasePage.tsx b/frontend/src/pages/RAGDatabasePage.tsx index 2d13ccd..5cba000 100644 --- a/frontend/src/pages/RAGDatabasePage.tsx +++ b/frontend/src/pages/RAGDatabasePage.tsx @@ -1,4 +1,4 @@ -import React, { useState, useCallback } from 'react' +import React, { useState, useCallback, useMemo } from 'react' import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react' import { useQueryClient } from '@tanstack/react-query' import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries' @@ -12,8 +12,15 @@ interface FileUploadEntry { error?: string } +function getDocumentIdFromUrl(): string | null { + if (typeof window === 'undefined') return null + const params = new URLSearchParams(window.location.search) + return params.get('document') +} + export const RAGDatabasePage: React.FC = () => { - const [expandedId, setExpandedId] = useState(null) + const initialDocId = useMemo(() => getDocumentIdFromUrl(), []) + const [expandedId, setExpandedId] = useState(initialDocId) const [uploadEntries, setUploadEntries] = useState([]) const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments() diff --git a/frontend/src/test/utils/citationParser.test.ts b/frontend/src/test/utils/citationParser.test.ts index 206efd6..50d5cd7 100644 --- a/frontend/src/test/utils/citationParser.test.ts +++ b/frontend/src/test/utils/citationParser.test.ts @@ -10,6 +10,7 @@ const mockSources: SourceMetadata[] = [ chunk_index: 0, page_number: 3, chunk_file_path: 'chunk_0.pdf', + document_id: 'doc-001', }, { filename: 'meeting_notes.docx', @@ -18,6 +19,7 @@ const mockSources: SourceMetadata[] = [ chunk_index: 1, page_number: null, chunk_file_path: 'chunk_1.pdf', + document_id: 'doc-002', }, { filename: 'report.pdf', @@ -26,6 +28,7 @@ const mockSources: SourceMetadata[] = [ chunk_index: 2, page_number: 5, chunk_file_path: 'chunk_2.pdf', + document_id: 'doc-003', }, ] @@ -89,7 +92,7 @@ describe('processCitations', () => { expect(result).toBe(text) }) - it('skips sources without chunk_file_path', () => { + it('generates fallback document link when chunk_file_path is null', () => { const sourcesWithoutPath = [ { filename: 'no_path.pdf', @@ -98,10 +101,47 @@ describe('processCitations', () => { chunk_index: 0, page_number: 1, chunk_file_path: null, + document_id: 'doc-fallback', }, ] - const text = 'Source [no_path.pdf, page 1] missing path.' + const text = 'Source [no_path.pdf, page 1] with document link.' const result = processCitations(text, sourcesWithoutPath) + expect(result).toContain('/rag-database?document=doc-fallback') + }) + + it('remains plain text when both chunk_file_path and document_id are null', () => { + const sourcesWithoutBoth = [ + { + filename: 'orphan.pdf', + upload_date: '2024-01-18', + content_summary: 'Summary', + chunk_index: 0, + page_number: 1, + chunk_file_path: null, + document_id: null, + }, + ] + const text = 'Source [orphan.pdf, page 1] completely missing.' + const result = processCitations(text, sourcesWithoutBoth) expect(result).toBe(text) }) + + it('fuzzy matches citation without file extension', () => { + const text = 'See [NEC4 ACC, page 3] without .pdf extension.' + const result = processCitations(text, mockSources) + expect(result).toContain('/pdf-viewer') + expect(result).toContain('NEC4 ACC, page 3') + }) + + it('fuzzy matches citation with different case and without extension', () => { + const text = 'Cite [nec4 acc] lowercase no extension.' + const result = processCitations(text, mockSources) + expect(result).toContain('/pdf-viewer') + }) + + it('fuzzy matches filename-only citation without extension', () => { + const text = 'Notes [meeting_notes] from meeting (no .docx).' + const result = processCitations(text, mockSources) + expect(result).toContain('/pdf-viewer') + }) }) diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index f63dc05..a16d574 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -5,6 +5,7 @@ export interface SourceMetadata { chunk_index: number page_number: number | null chunk_file_path: string | null + document_id: string | null } export interface SubQuestionSources { diff --git a/frontend/src/utils/citationParser.ts b/frontend/src/utils/citationParser.ts index 763e156..084af38 100644 --- a/frontend/src/utils/citationParser.ts +++ b/frontend/src/utils/citationParser.ts @@ -1,14 +1,29 @@ import type { SourceMetadata, SubQuestionSources } from '../types' import { getPdfViewerUrl } from '../lib/api' +const SUPPORTED_EXTENSIONS = /\.(pdf|docx|txt)$/i + +function stripExtension(filename: string): string { + return filename.replace(SUPPORTED_EXTENSIONS, '').trim() +} + function buildCitationLookup(sources: SourceMetadata[]): Map { const lookup = new Map() for (const source of sources) { + const fname = source.filename.trim() + + lookup.set(fname.toLowerCase(), source) if (source.page_number !== null) { - const keyWithPage = `${source.filename}, page ${source.page_number}` - lookup.set(keyWithPage.toLowerCase(), source) + lookup.set(`${fname}, page ${source.page_number}`.toLowerCase(), source) + } + + const stripped = stripExtension(fname) + if (stripped !== fname) { + lookup.set(stripped.toLowerCase(), source) + if (source.page_number !== null) { + lookup.set(`${stripped}, page ${source.page_number}`.toLowerCase(), source) + } } - lookup.set(source.filename.toLowerCase(), source) } return lookup } @@ -30,6 +45,45 @@ export function processCitationsForSubq( return replaceCitationPatterns(answerSection, lookup) } +function buildCitationUrl(source: SourceMetadata): string | null { + if (source.chunk_file_path) { + return getPdfViewerUrl( + source.chunk_file_path, + source.page_number ?? undefined, + source.filename + ) + } + if (source.document_id) { + return `/rag-database?document=${encodeURIComponent(source.document_id)}` + } + return null +} + +function findSource( + citationText: string, + lookup: Map +): SourceMetadata | undefined { + const trimmed = citationText.trim().toLowerCase() + + let source = lookup.get(trimmed) + + if (!source) { + const pageMatch = trimmed.match(/^(.+?),\s*page\s+(\d+)$/i) + if (pageMatch) { + source = lookup.get(pageMatch[1].trim().toLowerCase()) + } + } + + if (!source) { + const strippedCitation = stripExtension(trimmed) + if (strippedCitation !== trimmed) { + source = lookup.get(strippedCitation) + } + } + + return source +} + function replaceCitationPatterns( text: string, lookup: Map @@ -38,41 +92,19 @@ function replaceCitationPatterns( return text.replace(citationPattern, (fullMatch, content: string) => { const trimmed = content.trim() + const source = findSource(trimmed, lookup) - let source = lookup.get(trimmed.toLowerCase()) - - if (!source) { - const pageMatch = trimmed.match(/^(.+?),\s*page\s+(\d+)$/i) - if (pageMatch) { - const filename = pageMatch[1].trim() - source = lookup.get(filename.toLowerCase()) + if (source) { + const url = buildCitationUrl(source) + if (url) { + return `[${trimmed}](${url})` } } - if (source?.chunk_file_path) { - const url = getPdfViewerUrl( - source.chunk_file_path, - source.page_number ?? undefined, - source.filename - ) - return `[${trimmed}](${url})` - } - return fullMatch }) } -/** - * Parse citation patterns in answer text and replace with markdown links. - * - * Citation format: [filename, page N] or [filename] - * Only replaces citations that match an actual source in the sources array. - * Unmatched citations remain as plain text. - * - * @param text - The LLM answer text containing citations - * @param sources - Array of source metadata for cross-referencing - * @returns Modified text with matched citations converted to markdown links - */ export function processCitations(text: string, sources: SourceMetadata[]): string { if (!sources.length) return text