feat: question-based chunking strategy selector in RAG Database

Add ChunkingStrategy type ('token' | 'question') and wire it through
the ingest pipeline. Users can now choose between traditional token-window
chunking and question-based chunking (Q&A pair detection, table extraction).

Frontend changes:
- RAGDatabasePage: radio buttons for Token vs Question strategy
- DocumentList: strategy badges (blue 'chunked by question' / gray 'chunked by token')
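- ChunkList: question-strategy chunks that carry a question ID show Q&A
  metadata (question ID, truncated question text, topic, page range,
  'contains table' badge) instead of the single page number; all other
  chunks keep the page-number display. The PDF link for question-strategy
  chunks opens at the first page of the source page range.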
- api.ts / queries.tsx: pass the strategy as a query parameter to the
  /ingest endpoint (ingestDocument defaults to 'token')
- types/index.ts: new ChunkingStrategy type, new fields on ChunkInfo,
  DocumentInfo, IngestResponse
This commit is contained in:
Woody 2026-05-18 14:10:51 +08:00
parent 62db325f02
commit 82cc3a1d02
6 changed files with 102 additions and 14 deletions

View File

@ -56,9 +56,32 @@ export const ChunkList: React.FC<ChunkListProps> = ({
<span className="text-xs font-medium text-gray-500 uppercase"> <span className="text-xs font-medium text-gray-500 uppercase">
Chunk {chunk.chunk_index} Chunk {chunk.chunk_index}
</span> </span>
<span className="text-xs text-gray-400"> {chunk.strategy_type === 'question' && chunk.question_id ? (
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'} <>
</span> <span className="text-xs text-gray-600">
Q: {chunk.question_id}{chunk.question_text ? `${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''}
</span>
{chunk.topic_section && (
<span className="text-xs text-gray-500">
Topic: {chunk.topic_section}
</span>
)}
{chunk.source_page_range && chunk.source_page_range.length === 2 && (
<span className="text-xs text-gray-400">
Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]}
</span>
)}
{chunk.has_table && (
<span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-amber-100 text-amber-700">
Contains table
</span>
)}
</>
) : (
<span className="text-xs text-gray-400">
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
</span>
)}
</div> </div>
<div className="text-sm text-gray-700 truncate" title={chunk.content_summary}> <div className="text-sm text-gray-700 truncate" title={chunk.content_summary}>
{chunk.content_summary.length > 100 {chunk.content_summary.length > 100
@ -67,7 +90,12 @@ export const ChunkList: React.FC<ChunkListProps> = ({
</div> </div>
{chunk.chunk_file_path && ( {chunk.chunk_file_path && (
<a <a
href={getPdfViewerUrl(chunk.chunk_file_path, chunk.page_number ?? undefined)} href={getPdfViewerUrl(
chunk.chunk_file_path,
chunk.strategy_type === 'question' && chunk.source_page_range && chunk.source_page_range.length > 0
? chunk.source_page_range[0]
: chunk.page_number ?? undefined
)}
target="_blank" target="_blank"
rel="noopener noreferrer" rel="noopener noreferrer"
className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline" className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline"

View File

@ -29,7 +29,18 @@ export const DocumentList: React.FC<DocumentListProps> = ({
<div className="flex items-center space-x-3 flex-1"> <div className="flex items-center space-x-3 flex-1">
<FileText className="w-5 h-5 text-gray-500 flex-shrink-0" /> <FileText className="w-5 h-5 text-gray-500 flex-shrink-0" />
<div className="flex-1 min-w-0"> <div className="flex-1 min-w-0">
<div className="font-medium text-gray-900 truncate">{doc.filename}</div> <div className="flex items-center space-x-2">
<span className="font-medium text-gray-900 truncate">{doc.filename}</span>
{doc.chunking_strategy === 'question' ? (
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">
chunked by question
</span>
) : (
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-700">
chunked by token
</span>
)}
</div>
<div className="text-sm text-gray-500"> <div className="text-sm text-gray-500">
{doc.chunk_count} chunks Uploaded {doc.upload_date} {doc.chunk_count} chunks Uploaded {doc.upload_date}
</div> </div>

View File

@ -1,5 +1,5 @@
import axios from 'axios' import axios from 'axios'
import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types' import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1' const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1'
@ -48,10 +48,10 @@ export const queryDocumentStream = async (
} }
} }
export const ingestDocument = async (file: File): Promise<IngestResponse> => { export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise<IngestResponse> => {
const form = new FormData() const form = new FormData()
form.append('file', file) form.append('file', file)
const resp = await apiClient.post<IngestResponse>('/ingest', form, { const resp = await apiClient.post<IngestResponse>(`/ingest?strategy=${strategy}`, form, {
headers: { 'Content-Type': 'multipart/form-data' }, headers: { 'Content-Type': 'multipart/form-data' },
}) })
return resp.data return resp.data

View File

@ -1,7 +1,7 @@
import React from 'react' import React from 'react'
import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query' import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api' import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api'
import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types' import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
import { useState, useCallback, useRef } from 'react' import { useState, useCallback, useRef } from 'react'
export const queryClient = new QueryClient() export const queryClient = new QueryClient()
@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => {
} }
export const useIngestDocument = () => { export const useIngestDocument = () => {
return useMutation<IngestResponse, Error, File>({ return useMutation<IngestResponse, Error, { file: File; strategy: ChunkingStrategy }>({
mutationFn: ingestDocument, mutationFn: ({ file, strategy }) => ingestDocument(file, strategy),
}) })
} }

View File

@ -1,10 +1,11 @@
import React, { useState, useCallback, useMemo } from 'react' import React, { useState, useCallback, useMemo } from 'react'
import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react' import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react'
import { useQueryClient } from '@tanstack/react-query' import { useQueryClient } from '@tanstack/react-query'
import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries' import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries'
import { DocumentList } from '../components/DocumentList' import { DocumentList } from '../components/DocumentList'
import { ChunkList } from '../components/ChunkList' import { ChunkList } from '../components/ChunkList'
import { DocumentUpload } from '../components/DocumentUpload' import { DocumentUpload } from '../components/DocumentUpload'
import type { ChunkingStrategy } from '../types'
interface FileUploadEntry { interface FileUploadEntry {
name: string name: string
@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => {
const initialDocId = useMemo(() => getDocumentIdFromUrl(), []) const initialDocId = useMemo(() => getDocumentIdFromUrl(), [])
const [expandedId, setExpandedId] = useState<string | null>(initialDocId) const [expandedId, setExpandedId] = useState<string | null>(initialDocId)
const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([]) const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([])
const [chunkingStrategy, setChunkingStrategy] = useState<ChunkingStrategy>('token')
const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments() const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments()
const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId) const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId)
@ -60,7 +62,7 @@ export const RAGDatabasePage: React.FC = () => {
const results = await Promise.allSettled( const results = await Promise.allSettled(
files.map(async (file) => { files.map(async (file) => {
try { try {
await ingestDocumentMutation.mutateAsync(file) await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy })
setUploadEntries((prev) => setUploadEntries((prev) =>
prev.map((e) => prev.map((e) =>
e.name === file.name ? { ...e, status: 'success' as const } : e e.name === file.name ? { ...e, status: 'success' as const } : e
@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => {
queryClient.invalidateQueries({ queryKey: ['documents'] }) queryClient.invalidateQueries({ queryKey: ['documents'] })
setTimeout(() => setUploadEntries([]), 5000) setTimeout(() => setUploadEntries([]), 5000)
}, [ingestDocumentMutation, queryClient]) }, [ingestDocumentMutation, queryClient, chunkingStrategy])
const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length
const successCount = uploadEntries.filter((e) => e.status === 'success').length const successCount = uploadEntries.filter((e) => e.status === 'success').length
@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => {
/> />
</div> </div>
<div className="mt-3 flex items-center space-x-4">
<span className="text-sm font-medium text-gray-700">Chunking strategy:</span>
<div className="flex items-center space-x-3">
<label className="flex items-center space-x-2 cursor-pointer">
<input
type="radio"
name="chunking-strategy"
value="token"
checked={chunkingStrategy === 'token'}
onChange={() => setChunkingStrategy('token')}
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
/>
<Type className="w-4 h-4 text-gray-500" />
<div>
<span className="text-sm font-medium text-gray-900">Chunk by Token</span>
<span className="text-xs text-gray-500 ml-1">Split by token windows with overlap</span>
</div>
</label>
<label className="flex items-center space-x-2 cursor-pointer">
<input
type="radio"
name="chunking-strategy"
value="question"
checked={chunkingStrategy === 'question'}
onChange={() => setChunkingStrategy('question')}
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
/>
<MessageSquare className="w-4 h-4 text-gray-500" />
<div>
<span className="text-sm font-medium text-gray-900">Chunk by Question</span>
<span className="text-xs text-gray-500 ml-1">Detect Q&A pairs, extract tables</span>
</div>
</label>
</div>
</div>
{hasEntries && ( {hasEntries && (
<div className="mt-4 space-y-2"> <div className="mt-4 space-y-2">
<div className="text-sm font-medium text-gray-600"> <div className="text-sm font-medium text-gray-600">

View File

@ -1,3 +1,5 @@
export type ChunkingStrategy = 'token' | 'question'
export interface SourceMetadata { export interface SourceMetadata {
filename: string filename: string
upload_date: string upload_date: string
@ -40,6 +42,7 @@ export interface IngestResponse {
document_id: string document_id: string
chunk_count: number chunk_count: number
filename: string filename: string
strategy: ChunkingStrategy
} }
export interface DocumentInfo { export interface DocumentInfo {
@ -47,6 +50,7 @@ export interface DocumentInfo {
filename: string filename: string
chunk_count: number chunk_count: number
upload_date: string upload_date: string
chunking_strategy: ChunkingStrategy
} }
export interface ChunkInfo { export interface ChunkInfo {
@ -55,6 +59,13 @@ export interface ChunkInfo {
content_summary: string content_summary: string
page_number: number | null page_number: number | null
chunk_file_path: string | null chunk_file_path: string | null
strategy_type: ChunkingStrategy
question_index: number | null
question_id: string | null
question_text: string | null
topic_section: string | null
source_page_range: number[] | null
has_table: boolean | null
} }
export interface DocumentListResponse { export interface DocumentListResponse {