// legco_ai_assistant/frontend/src/utils/citationParser.ts
//
// 114 lines
// 3.0 KiB
// TypeScript

import type { SourceMetadata, SubQuestionSources } from '../types'
import { getPdfViewerUrl } from '../lib/api'
/** Matches a trailing `.pdf`, `.docx` or `.txt` extension, ignoring case. */
const SUPPORTED_EXTENSIONS = /\.(pdf|docx|txt)$/i

/**
 * Drops a trailing supported extension from a file name and trims the result.
 *
 * @param filename - File name, with or without an extension.
 * @returns The name without its supported extension, whitespace-trimmed.
 *          Names with any other (or no) extension are only trimmed.
 */
function stripExtension(filename: string): string {
  const extensionMatch = SUPPORTED_EXTENSIONS.exec(filename)
  // The pattern is anchored at the end, so everything before the match is the base name.
  const baseName = extensionMatch === null ? filename : filename.slice(0, extensionMatch.index)
  return baseName.trim()
}
/**
 * Builds a case-insensitive lookup from citation text to source metadata.
 *
 * Each source is registered under up to four lowercased keys:
 *   - its trimmed file name
 *   - `<file name>, page <n>` when a page number is present
 *   - the file name without a supported extension (only when stripping changes it)
 *   - `<file name without extension>, page <n>` when a page number is present
 *
 * When several sources produce the same key, the one appearing later in
 * `sources` overwrites the earlier entry.
 *
 * @param sources - Sources whose file names may be cited in the text.
 * @returns Map from lowercased citation key to the matching source.
 */
function buildCitationLookup(sources: SourceMetadata[]): Map<string, SourceMetadata> {
  const lookup = new Map<string, SourceMetadata>()

  for (const source of sources) {
    const fname = source.filename.trim()

    // `!= null` rejects both null and undefined, so a source whose page number
    // is missing never registers a "..., page undefined" key.
    const pageSuffix = source.page_number != null ? `, page ${source.page_number}` : null

    // Register the full name first, then the extension-less variant if it differs.
    const baseNames = [fname]
    const stripped = stripExtension(fname)
    if (stripped !== fname) {
      baseNames.push(stripped)
    }

    for (const baseName of baseNames) {
      lookup.set(baseName.toLowerCase(), source)
      if (pageSuffix !== null) {
        lookup.set(`${baseName}${pageSuffix}`.toLowerCase(), source)
      }
    }
  }

  return lookup
}
/**
 * Builds the citation lookup for a single sub-question.
 *
 * @param subQuestionSources - Per-sub-question source lists.
 * @param subqIndex - Index of the sub-question whose sources should be used.
 * @returns The citation lookup for that sub-question; an empty lookup when the
 *          index has no entry or the entry has no sources.
 */
export function buildCitationLookupForSubq(
  subQuestionSources: SubQuestionSources[],
  subqIndex: number
): Map<string, SourceMetadata> {
  const entry = subQuestionSources[subqIndex]
  return buildCitationLookup(entry?.sources ?? [])
}
/**
 * Rewrites bracketed citations in one answer section, resolving them against
 * the sources of the sub-question at `subqIndex`.
 *
 * @param answerSection - Answer text that may contain `[citation]` markers.
 * @param subQuestionSources - Per-sub-question source lists.
 * @param subqIndex - Index of the sub-question this section belongs to.
 * @returns The section text with resolvable citations replaced.
 */
export function processCitationsForSubq(
  answerSection: string,
  subQuestionSources: SubQuestionSources[],
  subqIndex: number
): string {
  return replaceCitationPatterns(
    answerSection,
    buildCitationLookupForSubq(subQuestionSources, subqIndex)
  )
}
/**
 * Chooses the link target for a cited source.
 *
 * A truthy `chunk_file_path` wins and yields a PDF viewer URL (a null page
 * number is passed on as `undefined`). Otherwise a truthy `document_id`
 * yields a `/rag-database` URL with the id URI-encoded.
 *
 * @param source - Metadata of the cited source.
 * @returns The URL to link to, or `null` when the source has neither field.
 */
function buildCitationUrl(source: SourceMetadata): string | null {
  const {
    chunk_file_path: chunkFilePath,
    document_id: documentId,
    page_number: pageNumber,
    filename,
  } = source

  if (chunkFilePath) {
    return getPdfViewerUrl(chunkFilePath, pageNumber ?? undefined, filename)
  }

  return documentId ? `/rag-database?document=${encodeURIComponent(documentId)}` : null
}
/**
 * Resolves the text of a citation to a source, trying progressively looser
 * matches against the lookup (all comparisons are case-insensitive):
 *
 *   1. the whole citation text
 *   2. for `<name>, page <n>` citations: the `<name>` part alone, then the
 *      `<name>` part without a supported extension
 *   3. the whole citation text without a supported extension
 *
 * @param citationText - Text found between the citation brackets.
 * @param lookup - Map from lowercased citation key to source.
 * @returns The matching source, or `undefined` when nothing matches.
 */
function findSource(
  citationText: string,
  lookup: Map<string, SourceMetadata>
): SourceMetadata | undefined {
  const normalized = citationText.trim().toLowerCase()

  const exact = lookup.get(normalized)
  if (exact) {
    return exact
  }

  // The cited page may differ from the page the source was registered with,
  // so fall back to the file name alone. `normalized` is already lowercase,
  // which means the captured name needs no further case folding.
  const citedFile = normalized.match(/^(.+?),\s*page\s+(\d+)$/i)?.[1]?.trim()
  if (citedFile) {
    // Also try without the extension, mirroring what step 3 does for
    // citations that carry no page suffix.
    const byFile = lookup.get(citedFile) ?? lookup.get(stripExtension(citedFile))
    if (byFile) {
      return byFile
    }
  }

  const withoutExtension = stripExtension(normalized)
  return withoutExtension !== normalized ? lookup.get(withoutExtension) : undefined
}
/**
 * Turns bracketed citations such as `[report.pdf, page 3]` into markdown
 * links `[report.pdf, page 3](url)`.
 *
 * Brackets that are preceded by `!` (images) or followed by `(` (already a
 * link) are skipped. A citation is left untouched when it does not resolve to
 * a source or the source yields no URL.
 *
 * @param text - Text that may contain bracketed citations.
 * @param lookup - Map from lowercased citation key to source.
 * @returns The text with every resolvable citation replaced by a link.
 */
function replaceCitationPatterns(
  text: string,
  lookup: Map<string, SourceMetadata>
): string {
  const citationPattern = /(?<!!)\[([^\]]+)\](?!\()/g

  return text.replace(citationPattern, (fullMatch: string, content: string) => {
    const label = content.trim()
    const source = findSource(label, lookup)
    const url = source ? buildCitationUrl(source) : null
    if (!url) {
      return fullMatch
    }

    // A markdown link destination ends at an unbalanced ")" and
    // encodeURIComponent leaves parentheses unescaped, so percent-encode them
    // to keep URLs built from names like "Report (1.pdf" intact.
    const destination = url.replace(/\(/g, '%28').replace(/\)/g, '%29')
    return `[${label}](${destination})`
  })
}
/**
 * Rewrites bracketed citations in `text`, resolving them against `sources`.
 *
 * @param text - Text that may contain `[citation]` markers.
 * @param sources - Sources the citations may refer to.
 * @returns The text with resolvable citations replaced; the input text
 *          unchanged when there are no sources.
 */
export function processCitations(text: string, sources: SourceMetadata[]): string {
  if (sources.length === 0) {
    return text
  }

  return replaceCitationPatterns(text, buildCitationLookup(sources))
}