114 lines
3.0 KiB
TypeScript
114 lines
3.0 KiB
TypeScript
import type { SourceMetadata, SubQuestionSources } from '../types'
|
|
import { getPdfViewerUrl } from '../lib/api'
|
|
|
|
/** Trailing file extensions (matched case-insensitively) that citation matching ignores. */
const SUPPORTED_EXTENSIONS = /\.(pdf|docx|txt)$/i

/**
 * Drops a trailing `.pdf`, `.docx`, or `.txt` extension from a filename.
 *
 * The extension is only removed when it sits at the very end of the input;
 * the result is then trimmed of surrounding whitespace. Names with any other
 * extension come back unchanged apart from the trim.
 *
 * @param filename - Filename to normalize.
 * @returns The filename without a supported extension, whitespace-trimmed.
 */
function stripExtension(filename: string): string {
  const withoutExtension = filename.replace(SUPPORTED_EXTENSIONS, '')
  return withoutExtension.trim()
}
|
|
|
|
/**
 * Builds a case-insensitive index from citation text to source metadata.
 *
 * For every source, the trimmed filename is registered under its lower-cased
 * form and, when `page_number` is not `null`, also under
 * `"<name>, page <n>"`. If removing a supported extension changes the name,
 * the same two keys are registered for the extension-less name as well.
 *
 * Keys are written with `Map.set`, so when several sources produce the same
 * key the one that appears later in `sources` wins.
 *
 * @param sources - Sources that citations may refer to.
 * @returns Map from lower-cased citation key to its source.
 */
function buildCitationLookup(sources: SourceMetadata[]): Map<string, SourceMetadata> {
  const byCitation = new Map<string, SourceMetadata>()

  sources.forEach((entry) => {
    const fullName = entry.filename.trim()
    const bareName = stripExtension(fullName)
    // Register the extension-less variant only when it actually differs.
    const nameVariants = bareName === fullName ? [fullName] : [fullName, bareName]

    for (const name of nameVariants) {
      byCitation.set(name.toLowerCase(), entry)
      if (entry.page_number !== null) {
        byCitation.set(`${name}, page ${entry.page_number}`.toLowerCase(), entry)
      }
    }
  })

  return byCitation
}
|
|
|
|
/**
 * Builds the citation lookup for a single sub-question.
 *
 * When `subqIndex` has no entry in `subQuestionSources`, or that entry's
 * `sources` is nullish, an empty source list is used, so the returned map
 * is empty.
 *
 * @param subQuestionSources - Per-sub-question source groups.
 * @param subqIndex - Index of the sub-question whose sources to use.
 * @returns Map from lower-cased citation key to its source.
 */
export function buildCitationLookupForSubq(
  subQuestionSources: SubQuestionSources[],
  subqIndex: number
): Map<string, SourceMetadata> {
  const group = subQuestionSources[subqIndex]
  return buildCitationLookup(group?.sources ?? [])
}
|
|
|
|
/**
 * Rewrites bracketed citations in one answer section, resolving them only
 * against the sources of the given sub-question.
 *
 * @param answerSection - Text of the answer section to process.
 * @param subQuestionSources - Per-sub-question source groups.
 * @param subqIndex - Index of the sub-question this section belongs to.
 * @returns The section text with resolvable citations turned into links.
 */
export function processCitationsForSubq(
  answerSection: string,
  subQuestionSources: SubQuestionSources[],
  subqIndex: number
): string {
  return replaceCitationPatterns(
    answerSection,
    buildCitationLookupForSubq(subQuestionSources, subqIndex)
  )
}
|
|
|
|
/**
 * Chooses the link target for a source.
 *
 * A truthy `chunk_file_path` takes precedence: the URL comes from
 * `getPdfViewerUrl`, called with the chunk path, the page number (`null`
 * is passed as `undefined`), and the filename. Otherwise a truthy
 * `document_id` yields a `/rag-database?document=...` URL with the id
 * URI-component-encoded.
 *
 * @param source - Source to link to.
 * @returns The URL, or `null` when the source has neither field set.
 */
function buildCitationUrl(source: SourceMetadata): string | null {
  const { chunk_file_path: chunkPath, document_id: documentId } = source

  if (chunkPath) {
    return getPdfViewerUrl(chunkPath, source.page_number ?? undefined, source.filename)
  }

  return documentId ? `/rag-database?document=${encodeURIComponent(documentId)}` : null
}
|
|
|
|
/**
 * Resolves the text inside a bracketed citation to a source.
 *
 * The citation is trimmed and lower-cased, then tried against `lookup` in
 * this order, returning the first hit:
 *
 * 1. The whole citation as written.
 * 2. For citations shaped like `"<name>, page <n>"`:
 *    a. `<name>` alone (any page of that file);
 *    b. `<name>` with a supported extension removed, first as
 *       `"<stripped>, page <n>"`, then as `<stripped>` alone.
 * 3. Otherwise, the citation with a supported extension removed.
 *
 * Step 2b makes extension tolerance apply to page-suffixed citations too:
 * the extension sits before `", page <n>"`, so stripping the whole citation
 * (which is anchored at the end of the string) can never remove it.
 *
 * @param citationText - Raw text found between the brackets.
 * @param lookup - Map from lower-cased citation key to source.
 * @returns The matching source, or `undefined` when nothing matches.
 */
function findSource(
  citationText: string,
  lookup: Map<string, SourceMetadata>
): SourceMetadata | undefined {
  const trimmed = citationText.trim().toLowerCase()

  const exact = lookup.get(trimmed)
  if (exact) return exact

  const pageMatch = trimmed.match(/^(.+?),\s*page\s+(\d+)$/i)
  if (pageMatch) {
    // `trimmed` is already lower-cased, so the captured name is too.
    const baseName = pageMatch[1].trim()
    const byName = lookup.get(baseName)
    if (byName) return byName

    const strippedName = stripExtension(baseName)
    if (strippedName !== baseName) {
      // Prefer the entry for the cited page; fall back to any page of the file.
      return lookup.get(`${strippedName}, page ${pageMatch[2]}`) ?? lookup.get(strippedName)
    }

    // A page-suffixed citation ends in digits, so it cannot also end in a
    // supported extension; there is nothing further to try.
    return undefined
  }

  const strippedCitation = stripExtension(trimmed)
  return strippedCitation !== trimmed ? lookup.get(strippedCitation) : undefined
}
|
|
|
|
/**
 * Turns bracketed citations in `text` into Markdown links.
 *
 * A candidate is any `[...]` span that is not immediately preceded by `!`
 * and not immediately followed by `(`. The bracket content is trimmed and
 * resolved with `findSource`; when a source is found and `buildCitationUrl`
 * returns a truthy URL, the span becomes `[<trimmed content>](<url>)`.
 * Every other span is left exactly as it was.
 *
 * @param text - Text to scan.
 * @param lookup - Map from lower-cased citation key to source.
 * @returns The text with resolvable citations replaced by links.
 */
function replaceCitationPatterns(
  text: string,
  lookup: Map<string, SourceMetadata>
): string {
  const bracketedSpan = /(?<!!)\[([^\]]+)\](?!\()/g

  return text.replace(bracketedSpan, (original: string, inner: string) => {
    const label = inner.trim()

    const matched = findSource(label, lookup)
    if (!matched) return original

    const href = buildCitationUrl(matched)
    return href ? `[${label}](${href})` : original
  })
}
|
|
|
|
/**
 * Rewrites bracketed citations in `text` as links to the given sources.
 *
 * @param text - Text that may contain bracketed citations.
 * @param sources - Sources the citations may refer to.
 * @returns `text` unchanged when `sources` is empty; otherwise the text with
 *   resolvable citations replaced by links.
 */
export function processCitations(text: string, sources: SourceMetadata[]): string {
  if (sources.length === 0) {
    return text
  }

  return replaceCitationPatterns(text, buildCitationLookup(sources))
}
|