diff --git a/.plans/package8_enhancement_plan.md b/.plans/package8_enhancement_plan.md index 79391e3..544faea 100644 --- a/.plans/package8_enhancement_plan.md +++ b/.plans/package8_enhancement_plan.md @@ -327,7 +327,7 @@ For each section in the JSON response: If `qa_vision_enabled=false` or for cost optimization, use regex as a fast pre-filter. The LLM call is skipped for documents where regex confidently identifies all boundaries, and used only when: - No regex pattern matches (unknown format) - Regex produces < 2 sections (likely misdetection) -- `qa_verification_model` is not set to `"none"` +- `qa_structure_model` is not set to `"none"` ### Algorithm Detail: Table-to-Markdown @@ -382,7 +382,7 @@ class Settings(BaseSettings): # NEW: Q&A chunking config qa_vision_enabled: bool = True # Toggle vision-based table extraction (uses existing LLM_MODEL_NAME) qa_max_chunk_tokens: int = 3000 # Max tokens per Q&A chunk (before forced split) - qa_verification_model: str = "" # LLM for boundary verification (empty = use LLM_MODEL_NAME) + qa_structure_model: str = "" # LLM for structure detection (empty = use LLM_MODEL_NAME) qa_include_internal_refs: bool = True # Include [內部參考] in chunks qa_cache_vision_results: bool = True # Cache vision results per page @@ -390,7 +390,7 @@ class Settings(BaseSettings): # DEFAULT_CHUNKING_STRATEGY=token # QA_VISION_ENABLED=true # QA_MAX_CHUNK_TOKENS=3000 - # QA_VERIFICATION_MODEL= + # QA_STRUCTURE_MODEL= # QA_INCLUDE_INTERNAL_REFS=true # QA_CACHE_VISION_RESULTS=true diff --git a/backend/.env.example b/backend/.env.example index c6466db..499ded6 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -41,3 +41,11 @@ MAX_VIDEO_SIZE_MB=300 # Set to false to disable System Audio or Listen Mic capture SYSTEM_AUDIO_ENABLED=true MIC_ENABLED=true + +# Q&A-pair chunking (Package 8) +DEFAULT_CHUNKING_STRATEGY=token +QA_VISION_ENABLED=true +QA_MAX_CHUNK_TOKENS=3000 +QA_STRUCTURE_MODEL= +QA_INCLUDE_INTERNAL_REFS=true 
+QA_CACHE_VISION_RESULTS=true diff --git a/backend/app/core/config.py b/backend/app/core/config.py index d024928..f19349a 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -44,6 +44,14 @@ class Settings(BaseSettings): relevance_threshold: float = 7.0 llm_timeout: float = 60.0 + # Q&A-pair chunking strategy (Package 8) + default_chunking_strategy: str = "token" + qa_vision_enabled: bool = True + qa_max_chunk_tokens: int = 3000 + qa_structure_model: str = "" + qa_include_internal_refs: bool = True + qa_cache_vision_results: bool = True + # Alibaba Cloud DashScope ASR (Phase 2) dashscope_api_key: str = "" asr_model_name: str = "qwen3-asr-flash" diff --git a/backend/app/models/ingest.py b/backend/app/models/ingest.py index 3b54531..f501c87 100644 --- a/backend/app/models/ingest.py +++ b/backend/app/models/ingest.py @@ -1,7 +1,18 @@ +from typing import Literal + from pydantic import BaseModel +ChunkingStrategyType = Literal["token", "question"] + +VALID_CHUNKING_STRATEGIES = frozenset({"token", "question"}) + + +class IngestRequest(BaseModel): + strategy: ChunkingStrategyType = "token" + class IngestResponse(BaseModel): document_id: str chunk_count: int filename: str + strategy: ChunkingStrategyType = "token" diff --git a/backend/app/test/test_phase1_config.py b/backend/app/test/test_phase1_config.py index 37dbac2..26b7087 100644 --- a/backend/app/test/test_phase1_config.py +++ b/backend/app/test/test_phase1_config.py @@ -31,3 +31,47 @@ def test_config_default_values(monkeypatch): settings = Settings() assert settings.llm_base_url == "https://openrouter.ai/api/v1" assert settings.llm_model_name == "qwen/qwen3.5-35b-a3b" + + +def test_qa_chunking_config_defaults(monkeypatch): + """Phase 8.0: Q&A chunking config fields have correct defaults.""" + monkeypatch.delenv("DEFAULT_CHUNKING_STRATEGY", raising=False) + monkeypatch.delenv("QA_VISION_ENABLED", raising=False) + monkeypatch.delenv("QA_MAX_CHUNK_TOKENS", raising=False) + 
monkeypatch.delenv("QA_STRUCTURE_MODEL", raising=False) + monkeypatch.delenv("QA_INCLUDE_INTERNAL_REFS", raising=False) + monkeypatch.delenv("QA_CACHE_VISION_RESULTS", raising=False) + + from app.core.config import Settings + + settings = Settings() + assert settings.default_chunking_strategy == "token" + assert settings.qa_vision_enabled is True + assert settings.qa_max_chunk_tokens == 3000 + assert settings.qa_structure_model == "" + assert settings.qa_include_internal_refs is True + assert settings.qa_cache_vision_results is True + + +def test_qa_chunking_config_from_env(tmp_path, monkeypatch): + """Phase 8.0: Q&A chunking config fields load from .env.""" + env_file = tmp_path / ".env" + env_file.write_text( + "DEFAULT_CHUNKING_STRATEGY=question\n" + "QA_VISION_ENABLED=false\n" + "QA_MAX_CHUNK_TOKENS=5000\n" + "QA_STRUCTURE_MODEL=anthropic/claude-3-haiku\n" + "QA_INCLUDE_INTERNAL_REFS=false\n" + "QA_CACHE_VISION_RESULTS=false\n" + ) + + monkeypatch.chdir(tmp_path) + from app.core.config import Settings + + settings = Settings() + assert settings.default_chunking_strategy == "question" + assert settings.qa_vision_enabled is False + assert settings.qa_max_chunk_tokens == 5000 + assert settings.qa_structure_model == "anthropic/claude-3-haiku" + assert settings.qa_include_internal_refs is False + assert settings.qa_cache_vision_results is False diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index 6f87321..36c9bb7 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -1,3 +1,5 @@ +export type ChunkingStrategy = 'token' | 'question' + export interface SourceMetadata { filename: string upload_date: string @@ -40,6 +42,7 @@ export interface IngestResponse { document_id: string chunk_count: number filename: string + strategy: ChunkingStrategy } export interface DocumentInfo { @@ -47,6 +50,7 @@ export interface DocumentInfo { filename: string chunk_count: number upload_date: string + chunking_strategy: ChunkingStrategy } 
export interface ChunkInfo { @@ -55,6 +59,13 @@ export interface ChunkInfo { content_summary: string page_number: number | null chunk_file_path: string | null + strategy_type: ChunkingStrategy + question_index: number | null + question_id: string | null + question_text: string | null + topic_section: string | null + source_page_range: number[] | null + has_table: boolean | null } export interface DocumentListResponse {