// File-viewer metadata (not part of the module): 175 lines, 5.2 KiB, TypeScript
import type { SourceMetadata, SubQuestionSources } from '../types'
|
|
import { getPdfViewerUrl } from '../lib/api'
|
|
|
|
/**
 * Converts plain multi-paragraph text into a flat Markdown bullet list.
 *
 * - Blank or whitespace-only input yields `''`.
 * - Text that already contains a list item (`-`, `*`, `+`, `1.` or `1)`
 *   followed by whitespace or end of line) or a thematic break (`---`, `***`,
 *   `___`) is returned unchanged.
 * - Otherwise every blank-line-separated paragraph becomes one `- ` bullet,
 *   with its internal line breaks collapsed to single spaces.
 *
 * @param text - Raw answer text, possibly already Markdown.
 * @returns The original text when it is already structured, else a bullet list.
 */
export function bulletizeMarkdown(text: string): string {
  if (!text.trim()) return ''

  // Normalise CRLF / lone CR so paragraph splitting also works on Windows-style input.
  const normalized = text.replace(/\r\n?/g, '\n')

  // A list marker must be followed by whitespace (or end the line); otherwise
  // leading emphasis such as `**Bold**` or a value like `-5%` would be
  // mistaken for a bullet.
  const listItem = /^(?:[-*+]|\d+[.)])(?:\s|$)/
  // Three or more identical marker characters alone on a line form a thematic break.
  const thematicBreak = /^([-*_])(?:[ \t]*\1){2,}$/

  const isStructured = normalized
    .split('\n')
    .some((line) => listItem.test(line.trimStart()) || thematicBreak.test(line.trim()))
  if (isStructured) return text

  return normalized
    .split(/\n{2,}/)
    .filter((paragraph) => paragraph.trim())
    .map((paragraph) => `- ${paragraph.replace(/\n/g, ' ').trim()}`)
    .join('\n')
}
|
|
|
|
/** Trailing document extensions (case-insensitive) that `stripExtension` removes. */
const SUPPORTED_EXTENSIONS = /\.(pdf|docx|txt)$/i

/**
 * Removes a trailing `.pdf`, `.docx` or `.txt` extension from a filename.
 *
 * The input is trimmed before matching so surrounding whitespace cannot hide
 * the extension from the end-anchored pattern, and trimmed again afterwards so
 * a name like `"report .pdf"` comes back as `"report"`.
 *
 * @param filename - Filename, with or without a supported extension.
 * @returns The trimmed filename without its supported extension; names with
 *   any other extension are returned trimmed but otherwise unchanged.
 */
function stripExtension(filename: string): string {
  return filename.trim().replace(SUPPORTED_EXTENSIONS, '').trim()
}
|
|
|
|
/**
 * Builds a case-insensitive lookup from citation text to source metadata.
 *
 * For every source the following lower-cased keys are registered:
 * - the trimmed filename,
 * - `"<filename>, page <n>"` when the source has a page number,
 * - the same two forms with the supported extension stripped, when stripping
 *   changes the name.
 *
 * When several sources produce the same key, the later source in `sources`
 * overwrites the earlier one.
 *
 * @param sources - Sources to index.
 * @returns Map from lower-cased citation key to its source.
 */
function buildCitationLookup(sources: SourceMetadata[]): Map<string, SourceMetadata> {
  const lookup = new Map<string, SourceMetadata>()

  for (const source of sources) {
    const fname = source.filename.trim()
    const stripped = stripExtension(fname)
    const names = stripped !== fname ? [fname, stripped] : [fname]

    // `!= null` rejects both null and undefined, so a missing page number can
    // never register a bogus "..., page undefined" key.
    const hasPage = source.page_number != null

    for (const name of names) {
      lookup.set(name.toLowerCase(), source)
      if (hasPage) {
        lookup.set(`${name}, page ${source.page_number}`.toLowerCase(), source)
      }
    }
  }

  return lookup
}
|
|
|
|
/**
 * Builds the citation lookup for a single sub-question.
 *
 * @param subQuestionSources - Per-sub-question source lists.
 * @param subqIndex - Index of the sub-question to build the lookup for.
 * @returns The lookup for that sub-question's sources; an empty lookup when
 *   the index has no entry or the entry has no sources.
 */
export function buildCitationLookupForSubq(
  subQuestionSources: SubQuestionSources[],
  subqIndex: number
): Map<string, SourceMetadata> {
  const entry = subQuestionSources[subqIndex]
  return buildCitationLookup(entry?.sources ?? [])
}
|
|
|
|
/**
 * Rewrites the bracketed citations in one sub-question's answer section,
 * resolving them only against that sub-question's sources.
 *
 * @param answerSection - Markdown text of the answer section.
 * @param subQuestionSources - Per-sub-question source lists.
 * @param subqIndex - Index of the sub-question the section belongs to.
 * @param highlightKeys - Optional set of keys, forwarded unchanged to the
 *   citation replacement step.
 * @returns The section with resolvable citations replaced by numbered links.
 */
export function processCitationsForSubq(
  answerSection: string,
  subQuestionSources: SubQuestionSources[],
  subqIndex: number,
  highlightKeys?: Set<string>
): string {
  return replaceCitationPatterns(
    answerSection,
    buildCitationLookupForSubq(subQuestionSources, subqIndex),
    highlightKeys
  )
}
|
|
|
|
/**
 * Chooses the link target for a cited source.
 *
 * Candidates are tried in this order:
 * 1. the v2 highlights endpoint, when `highlightReady` is truthy and the
 *    source has both a document id and a sub-question text;
 * 2. the PDF viewer URL, when the source has a chunk file path;
 * 3. the `/rag-database` page for the document, when it has a document id.
 *
 * @param source - Source the citation resolved to.
 * @param highlightReady - Whether the highlights link may be used.
 * @returns The URL, or `null` when none of the candidates applies.
 */
function buildCitationUrl(source: SourceMetadata, highlightReady?: boolean): string | null {
  const {
    document_id: documentId,
    sub_question_text: subQuestion,
    chunk_file_path: chunkPath,
  } = source

  if (highlightReady && documentId && subQuestion) {
    // Falls back to '/api/v1' when VITE_API_BASE_URL is not set.
    const apiBase = import.meta.env.VITE_API_BASE_URL ?? '/api/v1'
    const query = `document_id=${encodeURIComponent(documentId)}&chunk_index=${source.chunk_index}&sub_question=${encodeURIComponent(subQuestion)}`
    return `${apiBase}/v2/highlights?${query}`
  }

  if (chunkPath) {
    return getPdfViewerUrl(chunkPath, source.page_number ?? undefined, source.filename)
  }

  return documentId ? `/rag-database?document=${encodeURIComponent(documentId)}` : null
}
|
|
|
|
/**
 * Resolves the text inside a bracketed citation to a source.
 *
 * Matching is case-insensitive and tries, in order:
 * 1. the citation text exactly as written (trimmed);
 * 2. for citations shaped like `"<name>, page <n>"`:
 *    a. the canonical `"<name>, page <n>"` key, which tolerates irregular
 *       spacing around the comma and leading zeros in the page number,
 *    b. the same key with the supported extension stripped from `<name>`,
 *    c. `<name>` alone, ignoring the page,
 *    d. `<name>` alone with its extension stripped;
 * 3. for all other citations, the text with its supported extension stripped.
 *
 * @param citationText - Text found between the square brackets.
 * @param lookup - Lookup whose keys are lower-cased citation keys.
 * @returns The matching source, or `undefined` when nothing matches.
 */
function findSource(
  citationText: string,
  lookup: Map<string, SourceMetadata>
): SourceMetadata | undefined {
  const key = citationText.trim().toLowerCase()

  const exact = lookup.get(key)
  if (exact) return exact

  const pageMatch = key.match(/^(.+?),\s*page\s+(\d+)$/i)
  if (pageMatch) {
    const [, rawBase = '', rawPage = ''] = pageMatch
    const base = rawBase.trim()
    // Lookup keys are built from the numeric page, so "07" must become "7".
    const page = rawPage.replace(/^0+(?=\d)/, '')

    // Prefer a page-specific entry so the citation links to the right page
    // even when several sources share one filename.
    const canonical = lookup.get(`${base}, page ${page}`)
    if (canonical) return canonical

    const bareBase = stripExtension(base)
    const baseChanged = bareBase !== base
    if (baseChanged) {
      const strippedWithPage = lookup.get(`${bareBase}, page ${page}`)
      if (strippedWithPage) return strippedWithPage
    }

    // No entry for that page: fall back to the file as a whole.
    const byName = lookup.get(base)
    if (byName) return byName
    return baseChanged ? lookup.get(bareBase) : undefined
  }

  const stripped = stripExtension(key)
  return stripped !== key ? lookup.get(stripped) : undefined
}
|
|
|
|
/**
 * Replaces bracketed citations such as `[report.pdf, page 3]` with numbered
 * Markdown links (`[1](url)`, `[2](url)`, ...).
 *
 * A bracket is treated as a citation only when it is not an image (`![...]`)
 * and not already a link (`[...](`). Brackets that do not resolve to a source,
 * or whose source yields no URL, are left exactly as written. The counter
 * advances once per replaced citation, in document order.
 *
 * @param text - Markdown text to rewrite.
 * @param lookup - Citation lookup used to resolve bracket contents.
 * @param highlightKeys - Optional set of
 *   `"<document_id>_<chunk_index>_<encoded sub-question>"` keys; a source whose
 *   key is present is passed to the URL builder as highlight-ready.
 * @returns The text with resolvable citations replaced by numbered links.
 */
function replaceCitationPatterns(
  text: string,
  lookup: Map<string, SourceMetadata>,
  highlightKeys?: Set<string>
): string {
  const citationPattern = /(?<!!)\[([^\]]+)\](?!\()/g
  let refCounter = 0

  return text.replace(citationPattern, (fullMatch: string, content: string) => {
    const source = findSource(content.trim(), lookup)
    if (!source) return fullMatch

    let isReady = false
    if (highlightKeys && source.document_id && source.sub_question_text) {
      isReady = highlightKeys.has(
        `${source.document_id}_${source.chunk_index}_${encodeURIComponent(source.sub_question_text)}`
      )
    }

    const url = buildCitationUrl(source, isReady)
    if (!url) return fullMatch

    // encodeURIComponent leaves "(" and ")" untouched, and a bare ")" or a
    // space would terminate the Markdown link destination early. Percent-encode
    // them; they decode back to the same characters on the receiving side.
    const safeUrl = url.replace(/[() ]/g, (ch) => `%${ch.charCodeAt(0).toString(16)}`)

    refCounter++
    return `[${refCounter}](${safeUrl})`
  })
}
|
|
|
|
/**
 * Rewrites the bracketed citations in `text` against a flat list of sources.
 *
 * @param text - Markdown text to rewrite.
 * @param sources - Sources the citations may refer to.
 * @param highlightKeys - Optional set of keys, forwarded unchanged to the
 *   citation replacement step.
 * @returns `text` untouched when `sources` is empty, otherwise the text with
 *   resolvable citations replaced by numbered links.
 */
export function processCitations(text: string, sources: SourceMetadata[], highlightKeys?: Set<string>): string {
  return sources.length === 0
    ? text
    : replaceCitationPatterns(text, buildCitationLookup(sources), highlightKeys)
}
|
|
|
|
/**
 * Lists the sources that are actually cited in an answer.
 *
 * Bracketed citations are detected the same way as for link replacement (not
 * an image, not already a link). Each cited source appears once, keyed by
 * `"<document_id>_<chunk_index>"`, in order of first appearance in the text.
 *
 * @param answerText - Answer text containing bracketed citations.
 * @param sources - Candidate sources.
 * @returns The cited sources; empty when the text is blank or there are no sources.
 */
export function extractCitedSources(answerText: string, sources: SourceMetadata[]): SourceMetadata[] {
  if (sources.length === 0 || answerText.trim() === '') return []

  const lookup = buildCitationLookup(sources)
  // A Map iterates in insertion order, which keeps first-appearance ordering.
  const cited = new Map<string, SourceMetadata>()

  for (const match of answerText.matchAll(/(?<!!)\[([^\]]+)\](?!\()/g)) {
    const source = findSource(match[1].trim(), lookup)
    if (!source) continue

    const key = `${source.document_id}_${source.chunk_index}`
    if (!cited.has(key)) cited.set(key, source)
  }

  return [...cited.values()]
}
|
|
|
|
/**
 * Turns `==term==` highlight markers into `<mark>` elements.
 *
 * Backtick-delimited code spans are copied through untouched. Outside code
 * spans a marker pair is converted only when the highlighted text starts and
 * ends with a character that is neither whitespace nor `=`, and the `==`
 * delimiters are not adjacent to another `=` or a backtick. This keeps
 * comparison operators (`a == b and c == d`) and runs of `=` (for example a
 * setext heading underline) from being mistaken for highlights.
 *
 * @param markdown - Markdown text that may contain `==term==` markers.
 * @returns The text with highlight markers replaced by `<mark>` elements.
 */
export function highlightTerms(markdown: string): string {
  const highlightPattern = /(?<![`=])==(?=[^\s=])(.+?)(?<=[^\s=])==(?![`=])/g

  return markdown
    .split(/(`[^`]*`)/)
    .map((segment, index) =>
      // The split pattern has one capture group, so odd indices are code spans.
      index % 2 === 1
        ? segment
        : segment.replace(highlightPattern, '<mark class="bg-yellow-200 rounded px-0.5">$1</mark>')
    )
    .join('')
}
|