feat(frontend): add citation parser utility with tests (sub-phase 2.6)
processCitations() parses [filename, page N] patterns from LLM answers. Cross-references with sources[] array to build clickable markdown links. Graceful fallback: unmatched citations remain as plain text. Handles markdown images/links, case-insensitive matching, DOCX without pages. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
e78b670baa
commit
9095432806
|
|
@ -0,0 +1,107 @@
|
||||||
|
import { describe, it, expect } from 'vitest'
|
||||||
|
import { processCitations } from '../../utils/citationParser'
|
||||||
|
import type { SourceMetadata } from '../../types'
|
||||||
|
|
||||||
|
/**
 * Seed rows for the shared fixture: filename, upload date, summary, page.
 * A `null` page models a document without page numbers (the DOCX entry).
 */
const sourceSeeds: ReadonlyArray<
  readonly [filename: string, uploadDate: string, summary: string, page: number | null]
> = [
  ['NEC4 ACC.pdf', '2024-01-15', 'Summary', 3],
  ['meeting_notes.docx', '2024-01-16', 'Minutes', null],
  ['report.pdf', '2024-01-17', 'Report', 5],
]

/**
 * Sources shared by the tests below. Each seed's position in the list
 * becomes its `chunk_index` and determines its `chunk_<index>.pdf` path.
 */
const mockSources: SourceMetadata[] = sourceSeeds.map(
  ([filename, upload_date, content_summary, page_number], chunk_index) => ({
    filename,
    upload_date,
    content_summary,
    chunk_index,
    page_number,
    chunk_file_path: `chunk_${chunk_index}.pdf`,
  })
)
|
||||||
|
|
||||||
|
// Behavioural tests for processCitations: matched citations become markdown
// links to the PDF viewer, everything else must come back untouched.
describe('processCitations', () => {
  it('returns original text when no sources provided', () => {
    const text = 'This has [NEC4 ACC.pdf, page 3] citation.'
    expect(processCitations(text, [])).toBe(text)
  })

  it('replaces matched citation with markdown link', () => {
    const text = 'Clause info [NEC4 ACC.pdf, page 3] is important.'
    const result = processCitations(text, mockSources)
    expect(result).toContain('](')
    expect(result).toContain('/pdf-viewer')
    expect(result).toMatch(/\[NEC4 ACC\.pdf, page 3\]\([^)]+\)/)
  })

  it('handles filename-only citation for DOCX (no page)', () => {
    const text = 'Notes [meeting_notes.docx] from meeting.'
    const result = processCitations(text, mockSources)
    expect(result).toContain('/pdf-viewer')
    // The citation itself must be the link label, not merely present in the text.
    expect(result).toContain('[meeting_notes.docx](')
  })

  it('leaves unmatched citations as plain text', () => {
    const text = 'Unknown source [unknown_file.pdf, page 10] here.'
    const result = processCitations(text, mockSources)
    expect(result).toBe(text)
  })

  it('handles multiple citations in same text', () => {
    const text = 'A [NEC4 ACC.pdf, page 3] and B [report.pdf, page 5].'
    const result = processCitations(text, mockSources)
    const linkCount = (result.match(/\[.+?\]\(/g) || []).length
    expect(linkCount).toBe(2)
    // Each citation must be linked individually, not just "two links somewhere".
    expect(result).toContain('[NEC4 ACC.pdf, page 3](')
    expect(result).toContain('[report.pdf, page 5](')
  })

  it('does not break existing markdown links', () => {
    const text = 'See [label](http://example.com) and [NEC4 ACC.pdf, page 3].'
    const result = processCitations(text, mockSources)
    expect(result).toContain('[label](http://example.com)')
    expect(result).toContain('/pdf-viewer')
    expect(result).toContain('[NEC4 ACC.pdf, page 3](')
  })

  it('does not break markdown images', () => {
    const text = ' and [NEC4 ACC.pdf, page 3].'
    const result = processCitations(text, mockSources)
    // Assert the whole image, including its URL, so a mangled target is caught.
    expect(result).toContain('')
    expect(result).toContain('/pdf-viewer')
    expect(result).toContain('[NEC4 ACC.pdf, page 3](')
  })

  it('matches case-insensitively', () => {
    const text = 'Cite [nec4 acc.pdf, page 3] lowercase.'
    const result = processCitations(text, mockSources)
    expect(result).toContain('/pdf-viewer')
    // The label keeps the casing used in the answer text.
    expect(result).toContain('[nec4 acc.pdf, page 3](')
  })

  it('leaves plain bracket text without matching source', () => {
    const text = 'Some [plain bracket text] without source.'
    const result = processCitations(text, mockSources)
    expect(result).toBe(text)
  })

  it('skips sources without chunk_file_path', () => {
    const sourcesWithoutPath: SourceMetadata[] = [
      {
        filename: 'no_path.pdf',
        upload_date: '2024-01-18',
        content_summary: 'Summary',
        chunk_index: 0,
        page_number: 1,
        chunk_file_path: null,
      },
    ]
    const text = 'Source [no_path.pdf, page 1] missing path.'
    const result = processCitations(text, sourcesWithoutPath)
    expect(result).toBe(text)
  })
})
|
||||||
|
|
@ -0,0 +1,59 @@
|
||||||
|
import type { SourceMetadata } from '../types'
|
||||||
|
import { getPdfViewerUrl } from '../lib/api'
|
||||||
|
|
||||||
|
/**
 * Index linkable sources by the citation strings an answer may contain.
 *
 * Every source that has a `chunk_file_path` is registered under its
 * lowercased filename and, when it has a page number, additionally under
 * `"<filename>, page <N>"` (also lowercased). Sources without a
 * `chunk_file_path` are left out: they can never produce a link, and
 * indexing them would let them shadow a linkable source that shares the
 * same filename. When several sources produce the same key, the last one
 * in `sources` wins.
 *
 * @param sources - Source metadata returned alongside the answer
 * @returns Map from lowercased citation key to the source it resolves to
 */
function buildCitationLookup(sources: SourceMetadata[]): Map<string, SourceMetadata> {
  const lookup = new Map<string, SourceMetadata>()

  for (const source of sources) {
    if (!source.chunk_file_path) continue

    const filenameKey = source.filename.toLowerCase()

    // `!= null` also guards against a page number that is missing entirely.
    if (source.page_number != null) {
      lookup.set(`${filenameKey}, page ${source.page_number}`, source)
    }
    lookup.set(filenameKey, source)
  }

  return lookup
}
|
||||||
|
|
||||||
|
/** Matches `[content]` that is neither a markdown image `![...]` nor an inline link `[...](...)`. */
const CITATION_PATTERN = /(?<!!)\[([^\]]+)\](?!\()/g

/** Splits `filename, page N` into filename and page digits, tolerating spacing and case differences. */
const PAGE_CITATION_PATTERN = /^(.+?),\s*page\s+(\d+)$/i

/**
 * Resolve a citation label to a source.
 *
 * Tries, in order: the label exactly as written, the canonical
 * `"<filename>, page <N>"` key rebuilt from the parsed label, and finally
 * the filename alone. All lookups use lowercased keys.
 */
function resolveCitation(
  label: string,
  lookup: Map<string, SourceMetadata>
): SourceMetadata | undefined {
  const exact = lookup.get(label.toLowerCase())
  if (exact) return exact

  const pageMatch = PAGE_CITATION_PATTERN.exec(label)
  if (!pageMatch) return undefined

  const filenameKey = (pageMatch[1] ?? '').trim().toLowerCase()
  // Number() normalises forms such as "03" so they hit the key stored for page 3.
  const page = Number(pageMatch[2])

  // Prefer the source for the cited page; only then fall back to any source
  // for that file, so odd spacing does not silently pick a different page.
  return lookup.get(`${filenameKey}, page ${page}`) ?? lookup.get(filenameKey)
}

/**
 * Parse citation patterns in answer text and replace them with markdown links.
 *
 * Citation format: `[filename, page N]` or `[filename]`, matched
 * case-insensitively. Only citations that resolve to a source with a
 * `chunk_file_path` are replaced; every other bracketed span, as well as
 * existing markdown images and inline links, is returned unchanged. The
 * link label is the citation text as written in the answer (trimmed).
 *
 * @param text - The LLM answer text containing citations
 * @param sources - Array of source metadata for cross-referencing
 * @returns Text with matched citations converted to markdown links
 */
export function processCitations(text: string, sources: SourceMetadata[]): string {
  if (!sources.length) return text

  const lookup = buildCitationLookup(sources)

  return text.replace(CITATION_PATTERN, (fullMatch: string, content: string) => {
    const label = content.trim()
    const source = resolveCitation(label, lookup)

    if (!source?.chunk_file_path) return fullMatch

    const url = getPdfViewerUrl(
      source.chunk_file_path,
      source.page_number ?? undefined,
      source.filename
    )
    return `[${label}](${url})`
  })
}
|
||||||
Loading…
Reference in New Issue