feat: Sub-Phase 8.0 — config & enums for Q&A-pair chunking strategy
Backend:
- Add 6 Q&A chunking config fields to Settings (default_chunking_strategy,
qa_vision_enabled, qa_max_chunk_tokens, qa_structure_model,
qa_include_internal_refs, qa_cache_vision_results)
- Define ChunkingStrategyType Literal + VALID_CHUNKING_STRATEGIES frozenset
- Add strategy field to IngestResponse (defaults to "token"; non-breaking)
- Add IngestRequest model with strategy param
- Update .env.example with new env vars
Frontend:
- Add ChunkingStrategy type ('token' | 'question')
- Extend IngestResponse, DocumentInfo, ChunkInfo with Q&A fields
Tests:
- test_qa_chunking_config_defaults — all defaults verified
- test_qa_chunking_config_from_env — env var overrides verified
Plan fix: rename qa_verification_model → qa_structure_model to match the
LLM-first architecture
This commit is contained in:
parent
6bf04cedb1
commit
ef10b937cf
|
|
@ -327,7 +327,7 @@ For each section in the JSON response:
|
||||||
If `qa_vision_enabled=false` or for cost optimization, use regex as a fast pre-filter. The LLM call is skipped for documents where regex confidently identifies all boundaries, and used only when:
|
If `qa_vision_enabled=false` or for cost optimization, use regex as a fast pre-filter. The LLM call is skipped for documents where regex confidently identifies all boundaries, and used only when:
|
||||||
- No regex pattern matches (unknown format)
|
- No regex pattern matches (unknown format)
|
||||||
- Regex produces < 2 sections (likely misdetection)
|
- Regex produces < 2 sections (likely misdetection)
|
||||||
- `qa_verification_model` is not set to `"none"`
|
- `qa_structure_model` is not set to `"none"`
|
||||||
|
|
||||||
### Algorithm Detail: Table-to-Markdown
|
### Algorithm Detail: Table-to-Markdown
|
||||||
|
|
||||||
|
|
@ -382,7 +382,7 @@ class Settings(BaseSettings):
|
||||||
# NEW: Q&A chunking config
|
# NEW: Q&A chunking config
|
||||||
qa_vision_enabled: bool = True # Toggle vision-based table extraction (uses existing LLM_MODEL_NAME)
|
qa_vision_enabled: bool = True # Toggle vision-based table extraction (uses existing LLM_MODEL_NAME)
|
||||||
qa_max_chunk_tokens: int = 3000 # Max tokens per Q&A chunk (before forced split)
|
qa_max_chunk_tokens: int = 3000 # Max tokens per Q&A chunk (before forced split)
|
||||||
qa_verification_model: str = "" # LLM for boundary verification (empty = use LLM_MODEL_NAME)
|
qa_structure_model: str = "" # LLM for structure detection (empty = use LLM_MODEL_NAME)
|
||||||
qa_include_internal_refs: bool = True # Include [內部參考] in chunks
|
qa_include_internal_refs: bool = True # Include [內部參考] in chunks
|
||||||
qa_cache_vision_results: bool = True # Cache vision results per page
|
qa_cache_vision_results: bool = True # Cache vision results per page
|
||||||
|
|
||||||
|
|
@ -390,7 +390,7 @@ class Settings(BaseSettings):
|
||||||
# DEFAULT_CHUNKING_STRATEGY=token
|
# DEFAULT_CHUNKING_STRATEGY=token
|
||||||
# QA_VISION_ENABLED=true
|
# QA_VISION_ENABLED=true
|
||||||
# QA_MAX_CHUNK_TOKENS=3000
|
# QA_MAX_CHUNK_TOKENS=3000
|
||||||
# QA_VERIFICATION_MODEL=
|
# QA_STRUCTURE_MODEL=
|
||||||
# QA_INCLUDE_INTERNAL_REFS=true
|
# QA_INCLUDE_INTERNAL_REFS=true
|
||||||
# QA_CACHE_VISION_RESULTS=true
|
# QA_CACHE_VISION_RESULTS=true
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,3 +41,11 @@ MAX_VIDEO_SIZE_MB=300
|
||||||
# Set to false to disable System Audio or Listen Mic capture
|
# Set to false to disable System Audio or Listen Mic capture
|
||||||
SYSTEM_AUDIO_ENABLED=true
|
SYSTEM_AUDIO_ENABLED=true
|
||||||
MIC_ENABLED=true
|
MIC_ENABLED=true
|
||||||
|
|
||||||
|
# Q&A-pair chunking (Package 8)
|
||||||
|
DEFAULT_CHUNKING_STRATEGY=token
|
||||||
|
QA_VISION_ENABLED=true
|
||||||
|
QA_MAX_CHUNK_TOKENS=3000
|
||||||
|
QA_STRUCTURE_MODEL=
|
||||||
|
QA_INCLUDE_INTERNAL_REFS=true
|
||||||
|
QA_CACHE_VISION_RESULTS=true
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,14 @@ class Settings(BaseSettings):
|
||||||
relevance_threshold: float = 7.0
|
relevance_threshold: float = 7.0
|
||||||
llm_timeout: float = 60.0
|
llm_timeout: float = 60.0
|
||||||
|
|
||||||
|
# Q&A-pair chunking strategy (Package 8)
|
||||||
|
default_chunking_strategy: str = "token"
|
||||||
|
qa_vision_enabled: bool = True
|
||||||
|
qa_max_chunk_tokens: int = 3000
|
||||||
|
qa_structure_model: str = ""
|
||||||
|
qa_include_internal_refs: bool = True
|
||||||
|
qa_cache_vision_results: bool = True
|
||||||
|
|
||||||
# Alibaba Cloud DashScope ASR (Phase 2)
|
# Alibaba Cloud DashScope ASR (Phase 2)
|
||||||
dashscope_api_key: str = ""
|
dashscope_api_key: str = ""
|
||||||
asr_model_name: str = "qwen3-asr-flash"
|
asr_model_name: str = "qwen3-asr-flash"
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,18 @@
|
||||||
|
from typing import Literal, get_args
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
# Closed set of chunking strategy names, usable as a static type on models.
ChunkingStrategyType = Literal["token", "question"]

# Runtime view of the same set, derived from the Literal so the two
# definitions cannot drift apart when a strategy is added or removed.
VALID_CHUNKING_STRATEGIES = frozenset(get_args(ChunkingStrategyType))
|
||||||
|
|
||||||
|
|
||||||
|
class IngestRequest(BaseModel):
    """Ingest request model carrying the chunking strategy parameter."""

    # Strategy name; falls back to "token" when the caller omits it.
    strategy: ChunkingStrategyType = "token"
|
||||||
|
|
||||||
|
|
||||||
class IngestResponse(BaseModel):
|
class IngestResponse(BaseModel):
|
||||||
document_id: str
|
document_id: str
|
||||||
chunk_count: int
|
chunk_count: int
|
||||||
filename: str
|
filename: str
|
||||||
|
strategy: ChunkingStrategyType = "token"
|
||||||
|
|
|
||||||
|
|
@ -31,3 +31,47 @@ def test_config_default_values(monkeypatch):
|
||||||
settings = Settings()
|
settings = Settings()
|
||||||
assert settings.llm_base_url == "https://openrouter.ai/api/v1"
|
assert settings.llm_base_url == "https://openrouter.ai/api/v1"
|
||||||
assert settings.llm_model_name == "qwen/qwen3.5-35b-a3b"
|
assert settings.llm_model_name == "qwen/qwen3.5-35b-a3b"
|
||||||
|
|
||||||
|
|
||||||
|
def test_qa_chunking_config_defaults(monkeypatch):
    """Phase 8.0: Q&A chunking config fields have correct defaults."""
    # Remove any overrides exported in the process environment.
    for name in (
        "DEFAULT_CHUNKING_STRATEGY",
        "QA_VISION_ENABLED",
        "QA_MAX_CHUNK_TOKENS",
        "QA_STRUCTURE_MODEL",
        "QA_INCLUDE_INTERNAL_REFS",
        "QA_CACHE_VISION_RESULTS",
    ):
        monkeypatch.delenv(name, raising=False)

    from app.core.config import Settings

    # Skip dotenv loading as well: a .env file in the working directory
    # would otherwise override the class defaults being verified here.
    settings = Settings(_env_file=None)
    assert settings.default_chunking_strategy == "token"
    assert settings.qa_vision_enabled is True
    assert settings.qa_max_chunk_tokens == 3000
    assert settings.qa_structure_model == ""
    assert settings.qa_include_internal_refs is True
    assert settings.qa_cache_vision_results is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_qa_chunking_config_from_env(tmp_path, monkeypatch):
    """Phase 8.0: Q&A chunking config fields load from .env."""
    # Real environment variables take priority over dotenv values, so clear
    # them first; otherwise an exported QA_* variable would mask the file.
    for name in (
        "DEFAULT_CHUNKING_STRATEGY",
        "QA_VISION_ENABLED",
        "QA_MAX_CHUNK_TOKENS",
        "QA_STRUCTURE_MODEL",
        "QA_INCLUDE_INTERNAL_REFS",
        "QA_CACHE_VISION_RESULTS",
    ):
        monkeypatch.delenv(name, raising=False)

    env_file = tmp_path / ".env"
    env_file.write_text(
        "DEFAULT_CHUNKING_STRATEGY=question\n"
        "QA_VISION_ENABLED=false\n"
        "QA_MAX_CHUNK_TOKENS=5000\n"
        "QA_STRUCTURE_MODEL=anthropic/claude-3-haiku\n"
        "QA_INCLUDE_INTERNAL_REFS=false\n"
        "QA_CACHE_VISION_RESULTS=false\n",
        encoding="utf-8",
    )

    # The test relies on Settings picking up .env from the working directory.
    monkeypatch.chdir(tmp_path)
    from app.core.config import Settings

    settings = Settings()
    assert settings.default_chunking_strategy == "question"
    assert settings.qa_vision_enabled is False
    assert settings.qa_max_chunk_tokens == 5000
    assert settings.qa_structure_model == "anthropic/claude-3-haiku"
    assert settings.qa_include_internal_refs is False
    assert settings.qa_cache_vision_results is False
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
export type ChunkingStrategy = 'token' | 'question'
|
||||||
|
|
||||||
export interface SourceMetadata {
|
export interface SourceMetadata {
|
||||||
filename: string
|
filename: string
|
||||||
upload_date: string
|
upload_date: string
|
||||||
|
|
@ -40,6 +42,7 @@ export interface IngestResponse {
|
||||||
document_id: string
|
document_id: string
|
||||||
chunk_count: number
|
chunk_count: number
|
||||||
filename: string
|
filename: string
|
||||||
|
strategy: ChunkingStrategy
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface DocumentInfo {
|
export interface DocumentInfo {
|
||||||
|
|
@ -47,6 +50,7 @@ export interface DocumentInfo {
|
||||||
filename: string
|
filename: string
|
||||||
chunk_count: number
|
chunk_count: number
|
||||||
upload_date: string
|
upload_date: string
|
||||||
|
chunking_strategy: ChunkingStrategy
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ChunkInfo {
|
export interface ChunkInfo {
|
||||||
|
|
@ -55,6 +59,13 @@ export interface ChunkInfo {
|
||||||
content_summary: string
|
content_summary: string
|
||||||
page_number: number | null
|
page_number: number | null
|
||||||
chunk_file_path: string | null
|
chunk_file_path: string | null
|
||||||
|
strategy_type: ChunkingStrategy
|
||||||
|
question_index: number | null
|
||||||
|
question_id: string | null
|
||||||
|
question_text: string | null
|
||||||
|
topic_section: string | null
|
||||||
|
source_page_range: number[] | null
|
||||||
|
has_table: boolean | null
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface DocumentListResponse {
|
export interface DocumentListResponse {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue