feat: question-based chunking strategy selector in RAG Database
Add ChunkingStrategy type ('token' | 'question') and wire it through
the ingest pipeline. Users can now choose between traditional token-window
chunking and question-based chunking (Q&A pair detection, table extraction).
Frontend changes:
- RAGDatabasePage: radio buttons for Token vs Question strategy
- DocumentList: strategy badges (blue 'chunked by question' / gray 'chunked by token')
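- ChunkList: question-strategy chunks that have a question ID show Q&A metadata
(question ID, question text truncated to 80 characters, topic, page range,
'Contains table' badge) instead of the raw page number; all other chunks still
show the page number. The PDF link for question-strategy chunks opens at the
first page of the source page range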
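- api.ts / queries.tsx: send the strategy to the /ingest endpoint as a
`strategy` query parameter (defaults to 'token'); useIngestDocument now takes
{ file, strategy } instead of a bare File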
- types/index.ts: new ChunkingStrategy type, new fields on ChunkInfo,
DocumentInfo, IngestResponse
This commit is contained in:
parent
62db325f02
commit
82cc3a1d02
|
|
@ -56,9 +56,32 @@ export const ChunkList: React.FC<ChunkListProps> = ({
|
|||
<span className="text-xs font-medium text-gray-500 uppercase">
|
||||
Chunk {chunk.chunk_index}
|
||||
</span>
|
||||
<span className="text-xs text-gray-400">
|
||||
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
|
||||
</span>
|
||||
{chunk.strategy_type === 'question' && chunk.question_id ? (
|
||||
<>
|
||||
<span className="text-xs text-gray-600">
|
||||
Q: {chunk.question_id}{chunk.question_text ? ` — ${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''}
|
||||
</span>
|
||||
{chunk.topic_section && (
|
||||
<span className="text-xs text-gray-500">
|
||||
Topic: {chunk.topic_section}
|
||||
</span>
|
||||
)}
|
||||
{chunk.source_page_range && chunk.source_page_range.length === 2 && (
|
||||
<span className="text-xs text-gray-400">
|
||||
Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]}
|
||||
</span>
|
||||
)}
|
||||
{chunk.has_table && (
|
||||
<span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-amber-100 text-amber-700">
|
||||
Contains table
|
||||
</span>
|
||||
)}
|
||||
</>
|
||||
) : (
|
||||
<span className="text-xs text-gray-400">
|
||||
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<div className="text-sm text-gray-700 truncate" title={chunk.content_summary}>
|
||||
{chunk.content_summary.length > 100
|
||||
|
|
@ -67,7 +90,12 @@ export const ChunkList: React.FC<ChunkListProps> = ({
|
|||
</div>
|
||||
{chunk.chunk_file_path && (
|
||||
<a
|
||||
href={getPdfViewerUrl(chunk.chunk_file_path, chunk.page_number ?? undefined)}
|
||||
href={getPdfViewerUrl(
|
||||
chunk.chunk_file_path,
|
||||
chunk.strategy_type === 'question' && chunk.source_page_range && chunk.source_page_range.length > 0
|
||||
? chunk.source_page_range[0]
|
||||
: chunk.page_number ?? undefined
|
||||
)}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline"
|
||||
|
|
|
|||
|
|
@ -29,7 +29,18 @@ export const DocumentList: React.FC<DocumentListProps> = ({
|
|||
<div className="flex items-center space-x-3 flex-1">
|
||||
<FileText className="w-5 h-5 text-gray-500 flex-shrink-0" />
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="font-medium text-gray-900 truncate">{doc.filename}</div>
|
||||
<div className="flex items-center space-x-2">
|
||||
<span className="font-medium text-gray-900 truncate">{doc.filename}</span>
|
||||
{doc.chunking_strategy === 'question' ? (
|
||||
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">
|
||||
chunked by question
|
||||
</span>
|
||||
) : (
|
||||
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-700">
|
||||
chunked by token
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<div className="text-sm text-gray-500">
|
||||
{doc.chunk_count} chunks • Uploaded {doc.upload_date}
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import axios from 'axios'
|
||||
import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
|
||||
import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
|
||||
|
||||
const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1'
|
||||
|
||||
|
|
@ -48,10 +48,10 @@ export const queryDocumentStream = async (
|
|||
}
|
||||
}
|
||||
|
||||
export const ingestDocument = async (file: File): Promise<IngestResponse> => {
|
||||
export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise<IngestResponse> => {
|
||||
const form = new FormData()
|
||||
form.append('file', file)
|
||||
const resp = await apiClient.post<IngestResponse>('/ingest', form, {
|
||||
const resp = await apiClient.post<IngestResponse>(`/ingest?strategy=${strategy}`, form, {
|
||||
headers: { 'Content-Type': 'multipart/form-data' },
|
||||
})
|
||||
return resp.data
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import React from 'react'
|
||||
import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
|
||||
import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api'
|
||||
import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
|
||||
import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
|
||||
import { useState, useCallback, useRef } from 'react'
|
||||
|
||||
export const queryClient = new QueryClient()
|
||||
|
|
@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => {
|
|||
}
|
||||
|
||||
export const useIngestDocument = () => {
|
||||
return useMutation<IngestResponse, Error, File>({
|
||||
mutationFn: ingestDocument,
|
||||
return useMutation<IngestResponse, Error, { file: File; strategy: ChunkingStrategy }>({
|
||||
mutationFn: ({ file, strategy }) => ingestDocument(file, strategy),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
import React, { useState, useCallback, useMemo } from 'react'
|
||||
import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react'
|
||||
import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react'
|
||||
import { useQueryClient } from '@tanstack/react-query'
|
||||
import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries'
|
||||
import { DocumentList } from '../components/DocumentList'
|
||||
import { ChunkList } from '../components/ChunkList'
|
||||
import { DocumentUpload } from '../components/DocumentUpload'
|
||||
import type { ChunkingStrategy } from '../types'
|
||||
|
||||
interface FileUploadEntry {
|
||||
name: string
|
||||
|
|
@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => {
|
|||
const initialDocId = useMemo(() => getDocumentIdFromUrl(), [])
|
||||
const [expandedId, setExpandedId] = useState<string | null>(initialDocId)
|
||||
const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([])
|
||||
const [chunkingStrategy, setChunkingStrategy] = useState<ChunkingStrategy>('token')
|
||||
|
||||
const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments()
|
||||
const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId)
|
||||
|
|
@ -60,7 +62,7 @@ export const RAGDatabasePage: React.FC = () => {
|
|||
const results = await Promise.allSettled(
|
||||
files.map(async (file) => {
|
||||
try {
|
||||
await ingestDocumentMutation.mutateAsync(file)
|
||||
await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy })
|
||||
setUploadEntries((prev) =>
|
||||
prev.map((e) =>
|
||||
e.name === file.name ? { ...e, status: 'success' as const } : e
|
||||
|
|
@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => {
|
|||
|
||||
queryClient.invalidateQueries({ queryKey: ['documents'] })
|
||||
setTimeout(() => setUploadEntries([]), 5000)
|
||||
}, [ingestDocumentMutation, queryClient])
|
||||
}, [ingestDocumentMutation, queryClient, chunkingStrategy])
|
||||
|
||||
const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length
|
||||
const successCount = uploadEntries.filter((e) => e.status === 'success').length
|
||||
|
|
@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => {
|
|||
/>
|
||||
</div>
|
||||
|
||||
<div className="mt-3 flex items-center space-x-4">
|
||||
<span className="text-sm font-medium text-gray-700">Chunking strategy:</span>
|
||||
<div className="flex items-center space-x-3">
|
||||
<label className="flex items-center space-x-2 cursor-pointer">
|
||||
<input
|
||||
type="radio"
|
||||
name="chunking-strategy"
|
||||
value="token"
|
||||
checked={chunkingStrategy === 'token'}
|
||||
onChange={() => setChunkingStrategy('token')}
|
||||
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
|
||||
/>
|
||||
<Type className="w-4 h-4 text-gray-500" />
|
||||
<div>
|
||||
<span className="text-sm font-medium text-gray-900">Chunk by Token</span>
|
||||
<span className="text-xs text-gray-500 ml-1">Split by token windows with overlap</span>
|
||||
</div>
|
||||
</label>
|
||||
<label className="flex items-center space-x-2 cursor-pointer">
|
||||
<input
|
||||
type="radio"
|
||||
name="chunking-strategy"
|
||||
value="question"
|
||||
checked={chunkingStrategy === 'question'}
|
||||
onChange={() => setChunkingStrategy('question')}
|
||||
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
|
||||
/>
|
||||
<MessageSquare className="w-4 h-4 text-gray-500" />
|
||||
<div>
|
||||
<span className="text-sm font-medium text-gray-900">Chunk by Question</span>
|
||||
<span className="text-xs text-gray-500 ml-1">Detect Q&A pairs, extract tables</span>
|
||||
</div>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{hasEntries && (
|
||||
<div className="mt-4 space-y-2">
|
||||
<div className="text-sm font-medium text-gray-600">
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
/**
 * Chunking strategy a user can select when ingesting a document:
 * `'token'` for token-window chunking, `'question'` for question-based chunking.
 */
export type ChunkingStrategy = 'token' | 'question'
|
||||
|
||||
export interface SourceMetadata {
|
||||
filename: string
|
||||
upload_date: string
|
||||
|
|
@ -40,6 +42,7 @@ export interface IngestResponse {
|
|||
document_id: string
|
||||
chunk_count: number
|
||||
filename: string
|
||||
strategy: ChunkingStrategy
|
||||
}
|
||||
|
||||
export interface DocumentInfo {
|
||||
|
|
@ -47,6 +50,7 @@ export interface DocumentInfo {
|
|||
filename: string
|
||||
chunk_count: number
|
||||
upload_date: string
|
||||
chunking_strategy: ChunkingStrategy
|
||||
}
|
||||
|
||||
export interface ChunkInfo {
|
||||
|
|
@ -55,6 +59,13 @@ export interface ChunkInfo {
|
|||
content_summary: string
|
||||
page_number: number | null
|
||||
chunk_file_path: string | null
|
||||
strategy_type: ChunkingStrategy
|
||||
question_index: number | null
|
||||
question_id: string | null
|
||||
question_text: string | null
|
||||
topic_section: string | null
|
||||
source_page_range: number[] | null
|
||||
has_table: boolean | null
|
||||
}
|
||||
|
||||
export interface DocumentListResponse {
|
||||
|
|
|
|||
Loading…
Reference in New Issue