From 82cc3a1d02237974cd97d1879c7309687175f8e2 Mon Sep 17 00:00:00 2001 From: Woody Date: Mon, 18 May 2026 14:10:51 +0800 Subject: [PATCH] feat: question-based chunking strategy selector in RAG Database Add ChunkingStrategy type ('token' | 'question') and wire it through the ingest pipeline. Users can now choose between traditional token-window chunking and question-based chunking (Q&A pair detection, table extraction). Frontend changes: - RAGDatabasePage: radio buttons for Token vs Question strategy - DocumentList: strategy badges (blue 'chunked by question' / gray 'chunked by token') - ChunkList: question-strategy chunks show Q&A metadata (question ID, topic, page range, 'contains table' badge) instead of raw page numbers - api.ts / queries.tsx: pass strategy param to /ingest endpoint - types/index.ts: new ChunkingStrategy type, new fields on ChunkInfo, DocumentInfo, IngestResponse --- frontend/src/components/ChunkList.tsx | 36 ++++++++++++++++--- frontend/src/components/DocumentList.tsx | 13 ++++++- frontend/src/lib/api.ts | 6 ++-- frontend/src/lib/queries.tsx | 6 ++-- frontend/src/pages/RAGDatabasePage.tsx | 44 ++++++++++++++++++++++-- frontend/src/types/index.ts | 11 ++++++ 6 files changed, 102 insertions(+), 14 deletions(-) diff --git a/frontend/src/components/ChunkList.tsx b/frontend/src/components/ChunkList.tsx index 4d9faee..2ab0b39 100644 --- a/frontend/src/components/ChunkList.tsx +++ b/frontend/src/components/ChunkList.tsx @@ -56,9 +56,32 @@ export const ChunkList: React.FC = ({ Chunk {chunk.chunk_index} - - Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'} - + {chunk.strategy_type === 'question' && chunk.question_id ? ( + <> + + Q: {chunk.question_id}{chunk.question_text ? ` — ${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''} + + {chunk.topic_section && ( + + Topic: {chunk.topic_section} + + )} + {chunk.source_page_range && chunk.source_page_range.length === 2 && ( + + Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]} + + )} + {chunk.has_table && ( + + Contains table + + )} + + ) : ( + + Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'} + + )}
{chunk.content_summary.length > 100 @@ -67,7 +90,12 @@ export const ChunkList: React.FC = ({
{chunk.chunk_file_path && ( 0 + ? chunk.source_page_range[0] + : chunk.page_number ?? undefined + )} target="_blank" rel="noopener noreferrer" className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline" diff --git a/frontend/src/components/DocumentList.tsx b/frontend/src/components/DocumentList.tsx index f257c86..ec98c49 100644 --- a/frontend/src/components/DocumentList.tsx +++ b/frontend/src/components/DocumentList.tsx @@ -29,7 +29,18 @@ export const DocumentList: React.FC = ({
-
{doc.filename}
+
+ {doc.filename} + {doc.chunking_strategy === 'question' ? ( + + chunked by question + + ) : ( + + chunked by token + + )} +
{doc.chunk_count} chunks • Uploaded {doc.upload_date}
diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 39ad983..65f09dd 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -1,5 +1,5 @@ import axios from 'axios' -import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types' +import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types' const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1' @@ -48,10 +48,10 @@ export const queryDocumentStream = async ( } } -export const ingestDocument = async (file: File): Promise => { +export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise => { const form = new FormData() form.append('file', file) - const resp = await apiClient.post('/ingest', form, { + const resp = await apiClient.post(`/ingest?strategy=${strategy}`, form, { headers: { 'Content-Type': 'multipart/form-data' }, }) return resp.data diff --git a/frontend/src/lib/queries.tsx b/frontend/src/lib/queries.tsx index 27ba71b..bf02227 100644 --- a/frontend/src/lib/queries.tsx +++ b/frontend/src/lib/queries.tsx @@ -1,7 +1,7 @@ import React from 'react' import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query' import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api' -import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types' +import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types' import { useState, useCallback, useRef } from 'react' export const queryClient = new QueryClient() @@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => { } export const useIngestDocument = () => { - return useMutation({ - mutationFn: ingestDocument, + return useMutation({ + mutationFn: ({ file, strategy }) => ingestDocument(file, strategy), }) } diff --git a/frontend/src/pages/RAGDatabasePage.tsx b/frontend/src/pages/RAGDatabasePage.tsx index 5cba000..f9e5dff 100644 --- a/frontend/src/pages/RAGDatabasePage.tsx +++ b/frontend/src/pages/RAGDatabasePage.tsx @@ -1,10 +1,11 @@ import React, { useState, useCallback, useMemo } from 'react' -import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react' +import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react' import { useQueryClient } from '@tanstack/react-query' import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries' import { DocumentList } from '../components/DocumentList' import { ChunkList } from '../components/ChunkList' import { DocumentUpload } from '../components/DocumentUpload' +import type { ChunkingStrategy } from '../types' interface FileUploadEntry { name: string @@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => { const initialDocId = useMemo(() => getDocumentIdFromUrl(), []) const [expandedId, setExpandedId] = useState(initialDocId) const [uploadEntries, setUploadEntries] = useState([]) + const [chunkingStrategy, setChunkingStrategy] = useState('token') const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments() const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId) @@ -60,7 +62,7 @@ export const RAGDatabasePage: React.FC = () => { const results = await Promise.allSettled( files.map(async (file) => { try { - await ingestDocumentMutation.mutateAsync(file) + await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy }) setUploadEntries((prev) => prev.map((e) => e.name === file.name ? { ...e, status: 'success' as const } : e @@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => { queryClient.invalidateQueries({ queryKey: ['documents'] }) setTimeout(() => setUploadEntries([]), 5000) - }, [ingestDocumentMutation, queryClient]) + }, [ingestDocumentMutation, queryClient, chunkingStrategy]) const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length const successCount = uploadEntries.filter((e) => e.status === 'success').length @@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => { />
+
+ Chunking strategy: +
+ + +
+
+ {hasEntries && (
diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index 6f87321..36c9bb7 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -1,3 +1,5 @@ +export type ChunkingStrategy = 'token' | 'question' + export interface SourceMetadata { filename: string upload_date: string @@ -40,6 +42,7 @@ export interface IngestResponse { document_id: string chunk_count: number filename: string + strategy: ChunkingStrategy } export interface DocumentInfo { @@ -47,6 +50,7 @@ export interface DocumentInfo { filename: string chunk_count: number upload_date: string + chunking_strategy: ChunkingStrategy } export interface ChunkInfo { @@ -55,6 +59,13 @@ export interface ChunkInfo { content_summary: string page_number: number | null chunk_file_path: string | null + strategy_type: ChunkingStrategy + question_index: number | null + question_id: string | null + question_text: string | null + topic_section: string | null + source_page_range: number[] | null + has_table: boolean | null } export interface DocumentListResponse {