From 9095432806af21c141b7e9eb2366b43bb9e7302f Mon Sep 17 00:00:00 2001
From: Woody <woody.ck.tse@gmail.com>
Date: Fri, 24 Apr 2026 17:53:10 +0800
Subject: [PATCH] feat(frontend): add citation parser utility with tests
 (sub-phase 2.6)

processCitations() parses [filename, page N] patterns from LLM answers.
Cross-references with sources[] array to build clickable markdown links.
Graceful fallback: unmatched citations remain as plain text.
Handles markdown images/links, case-insensitive matching, DOCX without pages.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 .../src/test/utils/citationParser.test.ts     | 107 ++++++++++++++++++
 frontend/src/utils/citationParser.ts          |  59 ++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 frontend/src/test/utils/citationParser.test.ts
 create mode 100644 frontend/src/utils/citationParser.ts

diff --git a/frontend/src/test/utils/citationParser.test.ts b/frontend/src/test/utils/citationParser.test.ts
new file mode 100644
index 0000000..206efd6
--- /dev/null
+++ b/frontend/src/test/utils/citationParser.test.ts
@@ -0,0 +1,107 @@
+import { describe, it, expect } from 'vitest'
+import { processCitations } from '../../utils/citationParser'
+import type { SourceMetadata } from '../../types'
+
+const mockSources: SourceMetadata[] = [ // shared fixture passed to processCitations in the tests below
+  { // PDF entry with a numeric page_number
+    filename: 'NEC4 ACC.pdf',
+    upload_date: '2024-01-15',
+    content_summary: 'Summary',
+    chunk_index: 0,
+    page_number: 3,
+    chunk_file_path: 'chunk_0.pdf',
+  },
+  { // DOCX entry: page_number is null, so it is cited by filename only
+    filename: 'meeting_notes.docx',
+    upload_date: '2024-01-16',
+    content_summary: 'Minutes',
+    chunk_index: 1,
+    page_number: null,
+    chunk_file_path: 'chunk_1.pdf',
+  },
+  { // second PDF entry with a numeric page_number
+    filename: 'report.pdf',
+    upload_date: '2024-01-17',
+    content_summary: 'Report',
+    chunk_index: 2,
+    page_number: 5,
+    chunk_file_path: 'chunk_2.pdf',
+  },
+]
+
+describe('processCitations', () => { // spec: bracketed citations become markdown links only when a source matches
+  it('returns original text when no sources provided', () => {
+    const text = 'This has [NEC4 ACC.pdf, page 3] citation.'
+    expect(processCitations(text, [])).toBe(text)
+  })
+
+  it('replaces matched citation with markdown link', () => {
+    const text = 'Clause info [NEC4 ACC.pdf, page 3] is important.'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('](')
+    expect(result).toContain('/pdf-viewer')
+    expect(result).toMatch(/\[NEC4 ACC\.pdf, page 3\]\([^)]+\)/)
+  })
+
+  it('handles filename-only citation for DOCX (no page)', () => {
+    const text = 'Notes [meeting_notes.docx] from meeting.'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('/pdf-viewer')
+    expect(result).toMatch(/\[meeting_notes\.docx\]\([^)]+\)/) // input already contains the filename, so assert the link itself
+  })
+
+  it('leaves unmatched citations as plain text', () => {
+    const text = 'Unknown source [unknown_file.pdf, page 10] here.'
+    const result = processCitations(text, mockSources)
+    expect(result).toBe(text)
+  })
+
+  it('handles multiple citations in same text', () => {
+    const text = 'A [NEC4 ACC.pdf, page 3] and B [report.pdf, page 5].'
+    const result = processCitations(text, mockSources)
+    const linkCount = (result.match(/\[.+?\]\(/g) || []).length
+    expect(linkCount).toBe(2)
+  })
+
+  it('does not break existing markdown links', () => {
+    const text = 'See [label](http://example.com) and [NEC4 ACC.pdf, page 3].'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('[label](http://example.com)')
+    expect(result).toContain('/pdf-viewer')
+  })
+
+  it('does not break markdown images', () => {
+    const text = '![diagram](http://example.com/img.png) and [NEC4 ACC.pdf, page 3].'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('![diagram](http://example.com/img.png)') // whole image markup must survive
+    expect(result).toContain('/pdf-viewer')
+  })
+
+  it('matches case-insensitively', () => {
+    const text = 'Cite [nec4 acc.pdf, page 3] lowercase.'
+    const result = processCitations(text, mockSources)
+    expect(result).toContain('/pdf-viewer')
+  })
+
+  it('leaves plain bracket text without matching source', () => {
+    const text = 'Some [plain bracket text] without source.'
+    const result = processCitations(text, mockSources)
+    expect(result).toBe(text)
+  })
+
+  it('skips sources without chunk_file_path', () => {
+    const sourcesWithoutPath: SourceMetadata[] = [
+      {
+        filename: 'no_path.pdf',
+        upload_date: '2024-01-18',
+        content_summary: 'Summary',
+        chunk_index: 0,
+        page_number: 1,
+        chunk_file_path: null,
+      },
+    ]
+    const text = 'Source [no_path.pdf, page 1] missing path.'
+    const result = processCitations(text, sourcesWithoutPath)
+    expect(result).toBe(text)
+  })
+})
diff --git a/frontend/src/utils/citationParser.ts b/frontend/src/utils/citationParser.ts
new file mode 100644
index 0000000..14715f0
--- /dev/null
+++ b/frontend/src/utils/citationParser.ts
@@ -0,0 +1,59 @@
+import type { SourceMetadata } from '../types'
+import { getPdfViewerUrl } from '../lib/api'
+
+/** Index sources by lowercased "filename, page N" (when page_number is not null) and by lowercased filename. */
+function buildCitationLookup(sources: SourceMetadata[]): Map<string, SourceMetadata> {
+  const byKey = new Map<string, SourceMetadata>()
+  for (const entry of sources) {
+    const { filename, page_number: page } = entry
+    // Map.set overwrites, so a later entry replaces an earlier one sharing a key.
+    if (page !== null) byKey.set(`${filename}, page ${page}`.toLowerCase(), entry)
+    byKey.set(filename.toLowerCase(), entry)
+  }
+  return byKey
+}
+
+/**
+ * Parse citation patterns in answer text and replace with markdown links.
+ *
+ * Citation format: [filename, page N] or [filename], matched case-insensitively.
+ * Only replaces citations that resolve to a source with a chunk_file_path.
+ * Unmatched citations, markdown images and markdown links remain untouched.
+ *
+ * @param text - The LLM answer text containing citations
+ * @param sources - Array of source metadata for cross-referencing
+ * @returns Modified text with matched citations converted to markdown links
+ */
+export function processCitations(text: string, sources: SourceMetadata[]): string {
+  if (!sources.length) return text
+
+  const lookup = buildCitationLookup(sources)
+  // Match [content] that is NOT part of markdown image ![...] or link [...](...)
+  const citationPattern = /(?<!!)\[([^\]]+)\](?!\()/g
+
+  return text.replace(citationPattern, (fullMatch, content: string) => {
+    const trimmed = content.trim()
+    let source = lookup.get(trimmed.toLowerCase())
+
+    if (!source) {
+      // Non-canonical spelling (e.g. "file.pdf,page 03"): retry the normalized
+      // "<filename>, page <n>" key first, then fall back to filename only.
+      const pageMatch = trimmed.match(/^(.+?),\s*page\s+(\d+)$/i)
+      if (pageMatch) {
+        const filename = pageMatch[1].trim().toLowerCase()
+        const page = Number.parseInt(pageMatch[2], 10)
+        source = lookup.get(`${filename}, page ${page}`) ?? lookup.get(filename)
+      }
+    }
+
+    if (source?.chunk_file_path) {
+      const url = getPdfViewerUrl(
+        source.chunk_file_path,
+        source.page_number ?? undefined,
+        source.filename
+      )
+      return `[${trimmed}](${url})`
+    }
+    return fullMatch
+  })
+}