legco_ai_assistant/frontend/src/utils/citationParser.ts

175 lines
5.2 KiB
TypeScript

import type { SourceMetadata, SubQuestionSources } from '../types'
import { getPdfViewerUrl } from '../lib/api'
/**
 * Turn plain paragraphs into a flat Markdown bullet list.
 *
 * - Blank or whitespace-only input yields `''`.
 * - Text that already contains a Markdown list item (`-`, `*`, `+`, `1.` or
 *   `1)` followed by whitespace) or a `---` / `***` thematic break is returned
 *   unchanged.
 * - Otherwise every paragraph (runs of text separated by one or more blank
 *   lines) becomes one `- ` bullet, with its inner newlines collapsed to spaces.
 *
 * @param text Markdown or plain text to convert.
 * @returns The bulletized text, the original text, or `''` as described above.
 */
export function bulletizeMarkdown(text: string): string {
  if (!text.trim()) return ''

  // A bullet marker only counts when whitespace (or end of line) follows it.
  // Without that requirement `**bold**`, `*emphasis*` and `-5` at the start of
  // a line were mistaken for list items and the text was never bulletized.
  const listItem = /^(?:[-*+](?:\s|$)|\d+[.)]\s)/
  // Thematic breaks are not list items, but text containing them is already
  // structured Markdown, so it keeps being returned untouched.
  const thematicBreak = /^(?:(?:-\s*){3,}|(?:\*\s*){3,})$/

  const alreadyStructured = text.split('\n').some((line) => {
    const content = line.trimStart()
    return listItem.test(content) || thematicBreak.test(content)
  })
  if (alreadyStructured) return text

  return text
    .split(/\n{2,}/)
    .filter((paragraph) => paragraph.trim())
    .map((paragraph) => `- ${paragraph.replace(/\n/g, ' ').trim()}`)
    .join('\n')
}
/** Trailing document extensions (matched case-insensitively) that citations may omit. */
const SUPPORTED_EXTENSIONS = /\.(pdf|docx|txt)$/i

/**
 * Drop a trailing `.pdf`, `.docx` or `.txt` extension from a filename.
 *
 * The result is always trimmed of surrounding whitespace, whether or not an
 * extension was removed.
 *
 * @param filename Filename or citation text to normalise.
 * @returns The trimmed name without a supported extension.
 */
function stripExtension(filename: string): string {
  const withoutExtension = filename.replace(SUPPORTED_EXTENSIONS, '')
  return withoutExtension.trim()
}
/**
 * Index sources by every citation label they may be referred to with.
 *
 * For each source the following lower-cased keys are registered, in order:
 * the trimmed filename, `"<filename>, page <n>"` (when `page_number` is not
 * `null`), and the same two forms with the supported extension removed
 * (only when removing it actually changes the name).
 * When several sources produce the same key, the later source wins.
 *
 * @param sources Sources to index.
 * @returns Map from lower-cased citation label to source.
 */
function buildCitationLookup(sources: SourceMetadata[]): Map<string, SourceMetadata> {
  const byLabel = new Map<string, SourceMetadata>()
  const register = (label: string, entry: SourceMetadata): void => {
    byLabel.set(label.toLowerCase(), entry)
  }

  sources.forEach((entry) => {
    const fullName = entry.filename.trim()
    const bareName = stripExtension(fullName)
    // The extension-less alias is only worth registering when it differs.
    const names = bareName !== fullName ? [fullName, bareName] : [fullName]

    for (const name of names) {
      register(name, entry)
      if (entry.page_number !== null) {
        register(`${name}, page ${entry.page_number}`, entry)
      }
    }
  })

  return byLabel
}
/**
 * Build the citation lookup for a single sub-question.
 *
 * @param subQuestionSources Per-sub-question source lists.
 * @param subqIndex Index of the sub-question to use.
 * @returns Lookup for that sub-question's sources; an empty lookup when the
 *   index has no entry or the entry has no sources.
 */
export function buildCitationLookupForSubq(
  subQuestionSources: SubQuestionSources[],
  subqIndex: number
): Map<string, SourceMetadata> {
  const entry = subQuestionSources[subqIndex]
  return buildCitationLookup(entry?.sources ?? [])
}
/**
 * Rewrite the bracketed citations of one sub-question's answer section into
 * numbered Markdown links, resolving them against that sub-question's sources.
 *
 * @param answerSection Markdown text of the answer section.
 * @param subQuestionSources Per-sub-question source lists.
 * @param subqIndex Index of the sub-question the section belongs to.
 * @param highlightKeys Optional set of keys marking sources whose highlight
 *   view is ready.
 * @returns The answer section with resolvable citations replaced by links.
 */
export function processCitationsForSubq(
  answerSection: string,
  subQuestionSources: SubQuestionSources[],
  subqIndex: number,
  highlightKeys?: Set<string>
): string {
  return replaceCitationPatterns(
    answerSection,
    buildCitationLookupForSubq(subQuestionSources, subqIndex),
    highlightKeys
  )
}
/**
 * Pick the link target for a cited source.
 *
 * Preference order:
 * 1. the v2 highlights endpoint, when `highlightReady` is set and the source
 *    has both a document id and a sub-question text;
 * 2. the PDF viewer URL, when the source has a chunk file path;
 * 3. the `/rag-database` page for the document, when only a document id exists.
 *
 * @param source Source the citation resolved to.
 * @param highlightReady Whether the highlight view may be linked for this source.
 * @returns The URL, or `null` when the source carries none of the needed fields.
 */
function buildCitationUrl(source: SourceMetadata, highlightReady?: boolean): string | null {
  const {
    document_id: documentId,
    sub_question_text: subQuestion,
    chunk_file_path: chunkPath,
  } = source

  if (highlightReady && documentId && subQuestion) {
    const v2Base = `${import.meta.env.VITE_API_BASE_URL ?? '/api/v1'}/v2`
    return `${v2Base}/highlights?document_id=${encodeURIComponent(documentId)}&chunk_index=${source.chunk_index}&sub_question=${encodeURIComponent(subQuestion)}`
  }

  if (chunkPath) {
    // A null page number is passed on as undefined.
    const page = source.page_number ?? undefined
    return getPdfViewerUrl(chunkPath, page, source.filename)
  }

  return documentId ? `/rag-database?document=${encodeURIComponent(documentId)}` : null
}
/**
 * Resolve the text inside a `[...]` citation to a known source.
 *
 * The citation is trimmed and lower-cased, then tried against `lookup` in
 * this order, returning the first hit:
 * 1. the citation as written;
 * 2. for citations shaped like `"<name>, page <n>"`:
 *    a. the canonically spaced `"<name>, page <n>"` key (covers spacing
 *       variants such as `"<name>,page <n>"`),
 *    b. `<name>` alone,
 *    c. `<name>` without its supported extension, with and then without the
 *       page - previously a page-qualified citation whose extension did not
 *       match the indexed source could never resolve, even though the same
 *       citation without the page did;
 * 3. otherwise, the citation without its supported extension.
 *
 * @param citationText Raw text found between the brackets.
 * @param lookup Map from lower-cased citation label to source.
 * @returns The matching source, or `undefined` when nothing matches.
 */
function findSource(
  citationText: string,
  lookup: Map<string, SourceMetadata>
): SourceMetadata | undefined {
  const key = citationText.trim().toLowerCase()

  const exact = lookup.get(key)
  if (exact) return exact

  const pageMatch = key.match(/^(.+?),\s*page\s+(\d+)$/i)
  if (pageMatch) {
    const base = pageMatch[1].trim()
    const pageSuffix = `, page ${pageMatch[2]}`
    const candidates = [`${base}${pageSuffix}`, base]

    const strippedBase = stripExtension(base)
    if (strippedBase !== base) {
      candidates.push(`${strippedBase}${pageSuffix}`, strippedBase)
    }

    for (const candidate of candidates) {
      const hit = lookup.get(candidate)
      if (hit) return hit
    }
    // A page-qualified citation ends in digits, so the whole-citation
    // extension fallback below can never apply to it.
    return undefined
  }

  const strippedCitation = stripExtension(key)
  return strippedCitation !== key ? lookup.get(strippedCitation) : undefined
}
/**
 * Replace bracketed citations in Markdown with numbered links.
 *
 * A citation is any `[...]` that is neither an image (`![...]`) nor already a
 * link (`[...](...)`). Each citation that resolves to a source with a URL is
 * rewritten to `[n](url)`, where `n` counts the replaced citations from 1 in
 * order of appearance. Everything else is left exactly as written.
 *
 * @param text Markdown to process.
 * @param lookup Map from lower-cased citation label to source.
 * @param highlightKeys Optional set of
 *   `<document_id>_<chunk_index>_<encoded sub-question>` keys; a source whose
 *   key is present is linked as highlight-ready.
 * @returns The text with resolvable citations turned into links.
 */
function replaceCitationPatterns(
  text: string,
  lookup: Map<string, SourceMetadata>,
  highlightKeys?: Set<string>
): string {
  const bracketed = /(?<!!)\[([^\]]+)\](?!\()/g
  let issued = 0

  const isHighlightReady = (src: SourceMetadata): boolean => {
    if (!highlightKeys || !src.document_id || !src.sub_question_text) return false
    return highlightKeys.has(
      `${src.document_id}_${src.chunk_index}_${encodeURIComponent(src.sub_question_text)}`
    )
  }

  return text.replace(bracketed, (original, inner: string) => {
    const src = findSource(inner.trim(), lookup)
    if (!src) return original

    const href = buildCitationUrl(src, isHighlightReady(src))
    if (!href) return original

    // Only citations that actually become links consume a reference number.
    issued += 1
    return `[${issued}](${href})`
  })
}
/**
 * Rewrite bracketed citations in `text` into numbered Markdown links using a
 * flat list of sources.
 *
 * @param text Markdown to process.
 * @param sources Sources the citations may refer to.
 * @param highlightKeys Optional set of keys marking sources whose highlight
 *   view is ready.
 * @returns `text` unchanged when `sources` is empty, otherwise the text with
 *   resolvable citations replaced by links.
 */
export function processCitations(
  text: string,
  sources: SourceMetadata[],
  highlightKeys?: Set<string>
): string {
  if (sources.length === 0) return text
  return replaceCitationPatterns(text, buildCitationLookup(sources), highlightKeys)
}
/**
 * List the sources that an answer actually cites.
 *
 * Scans `answerText` for bracketed citations (ignoring images and existing
 * links), resolves each against `sources`, and returns every resolved source
 * once - deduplicated by `<document_id>_<chunk_index>` - in order of first
 * appearance.
 *
 * @param answerText Markdown answer to scan.
 * @param sources Sources the citations may refer to.
 * @returns The cited sources; empty when the answer is blank or no sources
 *   are given.
 */
export function extractCitedSources(answerText: string, sources: SourceMetadata[]): SourceMetadata[] {
  if (!answerText.trim() || !sources.length) return []

  const lookup = buildCitationLookup(sources)
  // Map keeps insertion order, which gives first-appearance ordering for free.
  const cited = new Map<string, SourceMetadata>()

  for (const [, inner] of answerText.matchAll(/(?<!!)\[([^\]]+)\](?!\()/g)) {
    const src = findSource(inner.trim(), lookup)
    if (!src) continue

    const id = `${src.document_id}_${src.chunk_index}`
    if (!cited.has(id)) cited.set(id, src)
  }

  return Array.from(cited.values())
}
/**
 * Convert `==term==` markers into `<mark>` elements, leaving inline code alone.
 *
 * The text is split on backtick-delimited spans; those spans are copied
 * through verbatim, and in the remaining text every `==...==` pair that is not
 * directly adjacent to a backtick is wrapped in a yellow `<mark>`.
 *
 * @param markdown Markdown that may contain `==term==` markers.
 * @returns The Markdown with markers replaced by `<mark>` HTML.
 */
export function highlightTerms(markdown: string): string {
  // The capturing group keeps the code spans in the result, at odd positions.
  const segments = markdown.split(/(`[^`]*`)/)

  return segments.reduce((rendered, segment, position) => {
    const isCodeSpan = position % 2 === 1
    if (isCodeSpan) return rendered + segment

    const marked = segment.replace(
      /(?<!`)==(.+?)==(?!`)/g,
      '<mark class="bg-yellow-200 rounded px-0.5">$1</mark>'
    )
    return rendered + marked
  }, '')
}