From 9095432806af21c141b7e9eb2366b43bb9e7302f Mon Sep 17 00:00:00 2001
From: Woody
Date: Fri, 24 Apr 2026 17:53:10 +0800
Subject: [PATCH] feat(frontend): add citation parser utility with tests (sub-phase 2.6)

processCitations() parses [filename, page N] patterns from LLM answers.
Cross-references with sources[] array to build clickable markdown links.
Graceful fallback: unmatched citations remain as plain text.
Handles markdown images/links, case-insensitive matching, DOCX without pages.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus
---
 .../src/test/utils/citationParser.test.ts | 107 ++++++++++++++++++
 frontend/src/utils/citationParser.ts      |  70 ++++++++++++
 2 files changed, 177 insertions(+)
 create mode 100644 frontend/src/test/utils/citationParser.test.ts
 create mode 100644 frontend/src/utils/citationParser.ts

diff --git a/frontend/src/test/utils/citationParser.test.ts b/frontend/src/test/utils/citationParser.test.ts
new file mode 100644
index 0000000..206efd6
--- /dev/null
+++ b/frontend/src/test/utils/citationParser.test.ts
@@ -0,0 +1,107 @@
+import { describe, it, expect } from 'vitest'
+import { processCitations } from '../../utils/citationParser'
+import type { SourceMetadata } from '../../types'
+
+const mockSources: SourceMetadata[] = [
+  {
+    filename: 'NEC4 ACC.pdf',
+    upload_date: '2024-01-15',
+    content_summary: 'Summary',
+    chunk_index: 0,
+    page_number: 3,
+    chunk_file_path: 'chunk_0.pdf',
+  },
+  {
+    filename: 'meeting_notes.docx',
+    upload_date: '2024-01-16',
+    content_summary: 'Minutes',
+    chunk_index: 1,
+    page_number: null,
+    chunk_file_path: 'chunk_1.pdf',
+  },
+  {
+    filename: 'report.pdf',
+    upload_date: '2024-01-17',
+    content_summary: 'Report',
+    chunk_index: 2,
+    page_number: 5,
+    chunk_file_path: 'chunk_2.pdf',
+  },
+]
+
+describe('processCitations', () => {
+  it('returns original text when no sources provided', () => {
+    const text = 'This has [NEC4 ACC.pdf, page 3] citation.'
+    expect(processCitations(text, [])).toBe(text)
+  })
+
+  it('replaces matched citation with markdown link', () => {
+    const text = 'Clause info [NEC4 ACC.pdf, page 3] is important.'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('](')
+    expect(result).toContain('/pdf-viewer')
+    expect(result).toMatch(/\[NEC4 ACC\.pdf, page 3\]\([^)]+\)/)
+  })
+
+  it('handles filename-only citation for DOCX (no page)', () => {
+    const text = 'Notes [meeting_notes.docx] from meeting.'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('/pdf-viewer')
+    expect(result).toContain('meeting_notes.docx')
+  })
+
+  it('leaves unmatched citations as plain text', () => {
+    const text = 'Unknown source [unknown_file.pdf, page 10] here.'
+    const result = processCitations(text, mockSources)
+    expect(result).toBe(text)
+  })
+
+  it('handles multiple citations in same text', () => {
+    const text = 'A [NEC4 ACC.pdf, page 3] and B [report.pdf, page 5].'
+    const result = processCitations(text, mockSources)
+    const linkCount = (result.match(/\[.+?\]\(/g) || []).length
+    expect(linkCount).toBe(2)
+  })
+
+  it('does not break existing markdown links', () => {
+    const text = 'See [label](http://example.com) and [NEC4 ACC.pdf, page 3].'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('[label](http://example.com)')
+    expect(result).toContain('/pdf-viewer')
+  })
+
+  it('does not break markdown images', () => {
+    const text = '![diagram](http://example.com/img.png) and [NEC4 ACC.pdf, page 3].'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('![diagram]')
+    expect(result).toContain('/pdf-viewer')
+  })
+
+  it('matches case-insensitively', () => {
+    const text = 'Cite [nec4 acc.pdf, page 3] lowercase.'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('/pdf-viewer')
+  })
+
+  it('leaves plain bracket text without matching source', () => {
+    const text = 'Some [plain bracket text] without source.'
+    const result = processCitations(text, mockSources)
+    expect(result).toBe(text)
+  })
+
+  it('skips sources without chunk_file_path', () => {
+    const sourcesWithoutPath = [
+      {
+        filename: 'no_path.pdf',
+        upload_date: '2024-01-18',
+        content_summary: 'Summary',
+        chunk_index: 0,
+        page_number: 1,
+        chunk_file_path: null,
+      },
+    ]
+    const text = 'Source [no_path.pdf, page 1] missing path.'
+    const result = processCitations(text, sourcesWithoutPath)
+    expect(result).toBe(text)
+  })
+})
diff --git a/frontend/src/utils/citationParser.ts b/frontend/src/utils/citationParser.ts
new file mode 100644
index 0000000..14715f0
--- /dev/null
+++ b/frontend/src/utils/citationParser.ts
@@ -0,0 +1,70 @@
+import type { SourceMetadata } from '../types'
+import { getPdfViewerUrl } from '../lib/api'
+
+/**
+ * Index sources by lower-cased citation key for case-insensitive lookup.
+ *
+ * Each linkable source is registered under its bare filename and, when it has
+ * a page number, also under "filename, page N". Sources without a
+ * chunk_file_path are skipped: they can never produce a link, and indexing
+ * them could shadow a linkable source that shares the same filename.
+ * When several sources map to the same key, the last one in the array wins.
+ */
+function buildCitationLookup(sources: SourceMetadata[]): Map<string, SourceMetadata> {
+  const lookup = new Map<string, SourceMetadata>()
+  for (const source of sources) {
+    if (!source.chunk_file_path) continue
+    if (source.page_number !== null) {
+      const keyWithPage = `${source.filename}, page ${source.page_number}`
+      lookup.set(keyWithPage.toLowerCase(), source)
+    }
+    lookup.set(source.filename.toLowerCase(), source)
+  }
+  return lookup
+}
+
+/**
+ * Parse citation patterns in answer text and replace with markdown links.
+ *
+ * Citation format: [filename, page N] or [filename]
+ * Only replaces citations that match an actual source in the sources array.
+ * Unmatched citations remain as plain text.
+ *
+ * @param text - The LLM answer text containing citations
+ * @param sources - Array of source metadata for cross-referencing
+ * @returns Modified text with matched citations converted to markdown links
+ */
+export function processCitations(text: string, sources: SourceMetadata[]): string {
+  if (!sources.length) return text
+
+  const lookup = buildCitationLookup(sources)
+
+  // Match [content] that is NOT part of markdown image ![...] or link [...](...)
+  const citationPattern = /(?<!!)\[([^\]]+)\](?!\()/g
+
+  return text.replace(citationPattern, (fullMatch: string, content: string) => {
+    const trimmed = content.trim()
+
+    let source = lookup.get(trimmed.toLowerCase())
+
+    if (!source) {
+      // No exact key matched: fall back to the filename of a "name, page N" citation.
+      const pageMatch = trimmed.match(/^(.+?),\s*page\s+(\d+)$/i)
+      const filename = pageMatch?.[1]
+      if (filename !== undefined) {
+        source = lookup.get(filename.trim().toLowerCase())
+      }
+    }
+
+    if (source?.chunk_file_path) {
+      const url = getPdfViewerUrl(
+        source.chunk_file_path,
+        source.page_number ?? undefined,
+        source.filename
+      )
+      return `[${trimmed}](${url})`
+    }
+
+    return fullMatch
+  })
+}