legco_ai_assistant/backend/app/test/test_phase8_ingest.py

210 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3).
Covers:
- POST /api/v1/ingest?strategy=token — existing behavior unchanged
- POST /api/v1/ingest?strategy=question — Q&A chunking applied
- Invalid strategy values return 400
- IngestResponse includes strategy field
- TXT with Q&A format uses question strategy
- Document without Q&A falls back gracefully
"""
import io
import json
from typing import List, Tuple
from unittest.mock import MagicMock
import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from pypdf import PdfWriter
from app.routers.ingest import router
class _DeterministicEmbedding:
    """Embedding stand-in that maps text to a fixed 384-dimensional vector.

    Each of the first 384 characters contributes ``ord(ch) / 1000.0`` at its
    own position; remaining positions are ``0.0``. The same input therefore
    always yields the same vector, with no model involved.
    """

    def name(self) -> str:
        """Return the identifier reported for this embedding function."""
        return "test_deterministic"

    def __call__(self, input):
        """Embed an iterable of texts; returns one vector per text."""
        return self._embed(input)

    def embed_query(self, input):
        """Embed query texts exactly the same way as ``__call__``."""
        return self._embed(input)

    @staticmethod
    def _embed(texts):
        """Convert each text into a zero-padded list of 384 floats."""
        dim = 384
        embedded = []
        for text in texts:
            # Scale code points of the leading characters, then pad with zeros.
            leading = [ord(ch) / 1000.0 for ch in text[:dim]]
            embedded.append(leading + [0.0] * (dim - len(leading)))
        return embedded
def _create_real_pdf(content: str) -> bytes:
    """Return the bytes of a PDF containing a single blank 200x200 page.

    ``content`` is accepted for call-site symmetry with the other helpers
    but is not written into the document.
    """
    pdf = PdfWriter()
    pdf.add_blank_page(width=200, height=200)
    with io.BytesIO() as stream:
        pdf.write(stream)
        return stream.getvalue()
def _create_text_txt(content: str) -> bytes:
    """Return ``content`` as UTF-8 bytes, suitable for a plain-text upload."""
    return bytes(content, "utf-8")
@pytest.fixture
def client(tmp_path, monkeypatch):
    """Yield a TestClient for the ingest router with all storage under ``tmp_path``.

    Setup:
    - points the Chroma, document-chunk, prompts and history paths at
      per-test locations via environment variables;
    - clears the cached settings so those variables are picked up;
    - initialises the prompts and history SQLite databases;
    - replaces ``app.core.database.get_embedding_function_settings`` with a
      factory returning ``_DeterministicEmbedding``.

    Teardown clears the settings caches again so later tests do not see
    this test's paths.
    """
    from contextlib import closing

    chroma_path = str(tmp_path / "chroma_db")
    chunk_path = str(tmp_path / "document_chunk")
    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")

    monkeypatch.setenv("CHROMA_DB_PATH", chroma_path)
    monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path)
    monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path)
    monkeypatch.setenv("HISTORY_DB_PATH", history_path)
    monkeypatch.setenv("EMBEDDING_MODEL", "test-mock")
    monkeypatch.setenv("LLM_API_KEY", "test-key")

    # Imported here (after the environment is patched) and the caches are
    # cleared so settings are rebuilt from the values set above.
    from app.core.config import get_settings
    get_settings.cache_clear()
    from app.core.dependencies import get_settings_cached
    get_settings_cached.cache_clear()

    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles

    # closing() guarantees each connection is released even when an
    # init/seed call raises, instead of leaking the handle.
    with closing(_get_db(prompts_path)) as conn:
        init_prompts_db(conn)
        seed_default_profiles(conn)
    with closing(_get_db(history_path)) as hconn:
        init_history_db(hconn)

    monkeypatch.setattr(
        "app.core.database.get_embedding_function_settings",
        lambda settings: _DeterministicEmbedding(),
    )

    test_app = FastAPI()
    test_app.include_router(router, prefix="/api/v1")
    yield TestClient(test_app)

    # Teardown: drop cached settings that reference this test's tmp_path.
    get_settings_cached.cache_clear()
    get_settings.cache_clear()
def test_ingest_with_strategy_token(client):
    """An explicit ``strategy=token`` upload succeeds and reports the token strategy."""
    payload = _create_text_txt("This is a test document with enough content to generate chunks.")
    upload = {"file": ("test.txt", payload, "text/plain")}

    response = client.post("/api/v1/ingest?strategy=token", files=upload)

    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "token"
    assert body["chunk_count"] > 0
def test_ingest_invalid_strategy_rejected(client):
    """An unknown strategy value yields HTTP 400 with a detail mentioning "strategy"."""
    payload = _create_text_txt("test")
    upload = {"file": ("test.txt", payload, "text/plain")}

    response = client.post("/api/v1/ingest?strategy=invalid", files=upload)

    assert response.status_code == 400
    detail = response.json()["detail"]
    assert "strategy" in detail.lower()
def test_ingest_response_includes_strategy(client):
    """A successful ingest response body carries a ``strategy`` key."""
    payload = _create_text_txt("Strategy response test content with more text to ensure chunks.")
    upload = {"file": ("test.txt", payload, "text/plain")}

    response = client.post("/api/v1/ingest?strategy=token", files=upload)

    assert response.status_code == 200
    body = response.json()
    assert "strategy" in body
def test_ingest_default_strategy_is_token(client):
    """Omitting the ``strategy`` query parameter results in the token strategy."""
    payload = _create_text_txt("Default strategy test with enough text to generate output.")
    upload = {"file": ("test.txt", payload, "text/plain")}

    response = client.post("/api/v1/ingest", files=upload)

    assert response.status_code == 200
    reported = response.json()["strategy"]
    assert reported == "token"
def test_ingest_question_strategy_txt(client, monkeypatch):
    """A Q&A-formatted TXT upload with ``strategy=question`` succeeds and yields chunks.

    The question chunker is swapped for the canned mock installed by
    ``_mock_question_chunker`` before the request is made.
    """
    _mock_question_chunker(monkeypatch)
    payload = _create_text_txt(
        "問A1test question\n"
        "答A1test answer with more text here to ensure chunking works properly."
    )
    upload = {"file": ("test.txt", payload, "text/plain")}

    response = client.post("/api/v1/ingest?strategy=question", files=upload)

    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "question"
    assert body["chunk_count"] > 0
def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch):
    """Plain text without Q&A markers is still ingested under ``strategy=question``.

    NOTE(review): the chunker is replaced by ``_mock_question_chunker``, which
    returns a fixed chunk, so this checks the API path only; the real
    fallback logic is not exercised here.
    """
    _mock_question_chunker(monkeypatch)
    payload = _create_text_txt(
        "This is plain text without any Q&A markers, but it needs to be long "
        "enough to generate at least one chunk when processed by the tokenizer."
    )
    upload = {"file": ("plain.txt", payload, "text/plain")}

    response = client.post("/api/v1/ingest?strategy=question", files=upload)

    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "question"
    assert body["chunk_count"] > 0
def _mock_question_chunker(monkeypatch):
    """Patch ``app.utils.chunking.QuestionChunkingStrategy`` with a canned chunker.

    The replacement ignores its input and always produces one fixed
    question/answer chunk together with one metadata record.
    """
    canned_chunk = "Question: What is X?\n\nAnswer: X is Y."

    class _MockQuestionChunker:
        """Chunker double returning a single predetermined Q&A chunk."""

        def __init__(self, settings=None, llm_client=None):
            qa_record = {
                "strategy_type": "question",
                "section_type": "qa",
                "question_index": 0,
                "question_id": "A1",
                "question_text": "What is X?",
                "section_heading": "(A) Topic",
                "answer_contains_table": False,
                "source_page_range": [1, 2],
            }
            self._max_tokens = 3000
            self._chunk_metadata = [qa_record]

        def _keep_first_record(self):
            # Rebind to a fresh one-element list, as each chunking call does.
            self._chunk_metadata = self._chunk_metadata[:1]

        def chunk(self, text):
            """Return the canned chunk as a one-element list of strings."""
            self._keep_first_record()
            return [canned_chunk]

        def chunk_pages(self, pages, overlap_tokens=0):
            """Return the canned chunk paired with page number 1."""
            self._keep_first_record()
            return [(canned_chunk, 1)]

    target = "app.utils.chunking.QuestionChunkingStrategy"
    monkeypatch.setattr(target, _MockQuestionChunker)