test(backend): add Phase 4 unit tests for generate, format, history, prompts

Add 9 tests for generate_response_per_subquestion() and answer-format validation, covering multiple sub-questions, empty input, prompt construction, and markdown format. Add 8 tests for the new history XML/JSON formats (sources as a list of lists, <sub_q> wrappers in XML) and the new {context_sections} prompt template.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-26 23:28:58 +08:00
parent ab6ec28de6
commit dd98fa0b65
4 changed files with 514 additions and 0 deletions

View File

@ -0,0 +1,149 @@
"""Tests for RAGService.generate_response_per_subquestion() — Phase 4.3.
Covers sub-question-organized response generation:
- Two sub-questions with mixed chunk counts
- Empty input handling
- All-empty chunks fallback
- Prompt sent to the LLM contains the rendered per-sub-question context section
- LLM client not configured fallback
"""
import pytest
from unittest.mock import AsyncMock, MagicMock
from app.services.rag import RAGService
# ---------------------------------------------------------------------------
# Test: two sub-questions, LLM returns markdown with headers
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_generate_per_subq_two_questions():
    """Generate an answer for two sub-questions with uneven chunk counts.

    The first sub-question is given two chunks and the second one chunk. The
    mocked LLM replies with markdown carrying one ``## Sub-question N`` header
    per question. The answer must contain both headers, and ``grouped_sources``
    must hold one source list per sub-question, sized like the metadata passed
    in. The LLM must be called exactly once.
    """
    canned_markdown = (
        "## Sub-question 1: What is A?\n"
        "- Bullet point A1 [file_a.pdf, page 1]\n"
        "- Bullet point A2 [file_a.pdf, page 2]\n\n"
        "## Sub-question 2: What is B?\n"
        "- Bullet point B1 [file_b.pdf, page 1]\n"
    )
    mock_llm = MagicMock()
    mock_llm.complete = AsyncMock(return_value=canned_markdown)
    rag = RAGService(llm_client=mock_llm)

    metadata_for_a = [
        {"filename": "file_a.pdf", "page_number": 1, "content_summary": "Summary A1"},
        {"filename": "file_a.pdf", "page_number": 2, "content_summary": "Summary A2"},
    ]
    metadata_for_b = [
        {"filename": "file_b.pdf", "page_number": 1, "content_summary": "Summary B1"},
    ]

    answer, _prompt, grouped_sources = await rag.generate_response_per_subquestion(
        sub_questions=["What is A?", "What is B?"],
        sub_chunks=[["chunk A1 text", "chunk A2 text"], ["chunk B1 text"]],
        sub_metadata=[metadata_for_a, metadata_for_b],
    )

    # Both markdown headers from the LLM reply must be present in the answer.
    for header in ("## Sub-question 1: What is A?", "## Sub-question 2: What is B?"):
        assert header in answer

    # One source group per sub-question: two sources, then one.
    assert len(grouped_sources) == 2
    first_group, second_group = grouped_sources[0], grouped_sources[1]
    assert len(first_group) == 2
    assert len(second_group) == 1
    assert first_group[0]["filename"] == "file_a.pdf"
    assert second_group[0]["filename"] == "file_b.pdf"
    mock_llm.complete.assert_called_once()
# ---------------------------------------------------------------------------
# Test: empty input
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_generate_per_subq_empty_input():
    """With no sub-questions: fallback answer, no sources, and no LLM call."""
    expected_fallback = (
        "I could not find any relevant information to answer your question."
    )
    mock_llm = MagicMock()
    mock_llm.complete = AsyncMock()
    rag = RAGService(llm_client=mock_llm)

    result = await rag.generate_response_per_subquestion(
        sub_questions=[],
        sub_chunks=[],
        sub_metadata=[],
    )
    answer, _prompt, grouped_sources = result

    assert answer == expected_fallback
    assert grouped_sources == []
    # Nothing to answer, so the LLM must never be consulted.
    mock_llm.complete.assert_not_called()
# ---------------------------------------------------------------------------
# Test: sub-questions provided but all chunk lists empty
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_generate_per_subq_no_chunks():
    """Sub-questions exist but every chunk list is empty: expect the fallback.

    The answer must be the fallback message, ``grouped_sources`` must be empty,
    and the LLM must not be called.
    """
    expected_fallback = (
        "I could not find any relevant information to answer your question."
    )
    questions = ["What is A?", "What is B?"]
    mock_llm = MagicMock()
    mock_llm.complete = AsyncMock()
    rag = RAGService(llm_client=mock_llm)

    result = await rag.generate_response_per_subquestion(
        sub_questions=questions,
        sub_chunks=[[], []],
        sub_metadata=[[], []],
    )
    answer, _prompt, grouped_sources = result

    assert answer == expected_fallback
    assert grouped_sources == []
    mock_llm.complete.assert_not_called()
# ---------------------------------------------------------------------------
# Test: prompt contains context_sections placeholder
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_generate_per_subq_prompt_contains_context_sections():
    """Check the prompt handed to the LLM carries the per-sub-question context.

    The prompt passed to ``llm.complete`` is recorded and must contain the
    ``### Context for Sub-question 0:`` header, the chunk text, and the source
    filename.
    """
    seen = {"prompt": None}

    async def record_prompt(prompt, **kwargs):
        # Stash the prompt for the assertions below, then reply like an LLM.
        seen["prompt"] = prompt
        return "## Sub-question 1: What is A?\n- Answer"

    mock_llm = MagicMock()
    mock_llm.complete = AsyncMock(side_effect=record_prompt)
    rag = RAGService(llm_client=mock_llm)

    await rag.generate_response_per_subquestion(
        sub_questions=["What is A?"],
        sub_chunks=[["chunk text here"]],
        sub_metadata=[
            [{"filename": "file_a.pdf", "page_number": 1, "content_summary": "Sum"}],
        ],
    )

    sent_prompt = seen["prompt"]
    assert sent_prompt is not None
    for fragment in (
        "### Context for Sub-question 0:",
        "chunk text here",
        "file_a.pdf",
    ):
        assert fragment in sent_prompt
# ---------------------------------------------------------------------------
# Test: LLM client not configured
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_generate_per_subq_llm_not_configured():
    """Without an LLM client: 'LLM client not configured.' and no sources."""
    rag_without_llm = RAGService(llm_client=None)

    result = await rag_without_llm.generate_response_per_subquestion(
        sub_questions=["What is A?"],
        sub_chunks=[["some chunk"]],
        sub_metadata=[[{"filename": "a.pdf"}]],
    )
    answer, _prompt, grouped_sources = result

    assert answer == "LLM client not configured."
    assert grouped_sources == []

View File

@ -0,0 +1,175 @@
"""Tests for Phase 4.4 — New history recording formats.
Covers:
- sources JSON grouped by sub-question (list-of-lists)
- Per-sub-question XML chunk formatting (retrieved & filtered)
- New per-sub-question count columns in query_history table
- Backward compatibility: old records have NULL in new columns
"""
import json
import os
import sqlite3
import tempfile
import pytest
from app.core.sqlite_db import init_history_db
from app.routers.query import (
format_chunks_retrieved_per_subq,
format_chunks_filtered_per_subq,
)
# ---------------------------------------------------------------------------
# Test: sources JSON is grouped by sub-question
# ---------------------------------------------------------------------------
def test_sources_json_is_grouped_by_subq():
    """Round-trip a list-of-lists sources payload through JSON.

    History stores sources grouped per sub-question, e.g.
    ``json.dumps([[s1, s2], [s3]])`` rather than a flat ``[s1, s2, s3]``.
    After decoding, the outer list and each group must still be lists with the
    original sizes and contents.
    """
    first_group = [
        {"filename": "a.pdf", "page_number": 1},
        {"filename": "a.pdf", "page_number": 2},
    ]
    second_group = [{"filename": "b.pdf", "page_number": 1}]

    decoded = json.loads(json.dumps([first_group, second_group]))

    # Outer container: one entry per sub-question.
    assert isinstance(decoded, list)
    assert len(decoded) == 2

    # Each entry is itself a list of the expected size.
    group_a, group_b = decoded[0], decoded[1]
    assert isinstance(group_a, list)
    assert len(group_a) == 2
    assert isinstance(group_b, list)
    assert len(group_b) == 1

    assert group_a[0]["filename"] == "a.pdf"
    assert group_b[0]["filename"] == "b.pdf"
# ---------------------------------------------------------------------------
# Test: chunks retrieved XML has sub_q wrappers
# ---------------------------------------------------------------------------
def test_chunks_retrieved_xml_has_subq_wrappers():
    """Retrieved-chunk XML must wrap each sub-question's chunks in ``<sub_q>``.

    ``format_chunks_retrieved_per_subq`` is expected to emit an opening
    ``<sub_q idx="N" question="...">`` tag per sub-question, a closing
    ``</sub_q>``, and the chunk texts and filenames.
    """
    hits_for_a = [
        ("chunk A1 text", {"filename": "a.pdf", "page_number": 1}, 0.5),
    ]
    hits_for_b = [
        ("chunk B1 text", {"filename": "b.pdf", "page_number": 3}, 0.8),
        ("chunk B2 text", {"filename": "b.pdf", "page_number": 4}, 0.9),
    ]

    xml = format_chunks_retrieved_per_subq(
        [("What is A?", hits_for_a), ("What is B?", hits_for_b)]
    )

    expected_fragments = (
        '<sub_q idx="0" question="What is A?">',
        '<sub_q idx="1" question="What is B?">',
        "</sub_q>",
        "chunk A1 text",
        "chunk B1 text",
        "chunk B2 text",
        "a.pdf",
        "b.pdf",
    )
    for fragment in expected_fragments:
        assert fragment in xml
# ---------------------------------------------------------------------------
# Test: chunks filtered XML has sub_q wrappers
# ---------------------------------------------------------------------------
def test_chunks_filtered_xml_has_subq_wrappers():
    """Filtered-chunk XML must wrap chunks in ``<sub_q>`` and show relevance.

    ``format_chunks_filtered_per_subq`` is expected to emit an opening
    ``<sub_q idx="N" question="...">`` tag per sub-question, a closing
    ``</sub_q>``, the chunk texts, and a ``Relevance: <score>`` label per chunk.
    """
    kept_for_x = [
        ("filtered X1", {"filename": "x.pdf", "page_number": 1, "relevance_score": 8}),
    ]
    kept_for_y = [
        ("filtered Y1", {"filename": "y.pdf", "page_number": 2, "relevance_score": 9}),
    ]

    xml = format_chunks_filtered_per_subq(
        [("What is X?", kept_for_x), ("What is Y?", kept_for_y)]
    )

    expected_fragments = (
        '<sub_q idx="0" question="What is X?">',
        '<sub_q idx="1" question="What is Y?">',
        "</sub_q>",
        "filtered X1",
        "filtered Y1",
        "Relevance: 8",
        "Relevance: 9",
    )
    for fragment in expected_fragments:
        assert fragment in xml
# ---------------------------------------------------------------------------
# Test: per-subq count columns exist in query_history table
# ---------------------------------------------------------------------------
def test_per_subq_count_columns_exist():
    """Verify the per-sub-question count columns exist in ``query_history``.

    A fresh SQLite database is initialised with ``init_history_db`` and the
    table schema is inspected for ``chunks_retrieved_per_subq_count`` and
    ``chunks_filtered_per_subq_count``.
    """
    # TemporaryDirectory removes the scratch database when the block exits;
    # a bare mkdtemp() would leave a directory behind on every run.
    with tempfile.TemporaryDirectory() as db_dir:
        conn = sqlite3.connect(os.path.join(db_dir, "test_history.db"))
        try:
            init_history_db(conn)
            # PRAGMA table_info yields one row per column; index 1 is the name.
            cursor = conn.execute("PRAGMA table_info(query_history)")
            columns = {row[1] for row in cursor.fetchall()}
        finally:
            # Close even if initialisation fails, and before the directory
            # holding the database file is deleted.
            conn.close()

    assert "chunks_retrieved_per_subq_count" in columns, (
        f"chunks_retrieved_per_subq_count not found. Columns: {columns}"
    )
    assert "chunks_filtered_per_subq_count" in columns, (
        f"chunks_filtered_per_subq_count not found. Columns: {columns}"
    )
# ---------------------------------------------------------------------------
# Test: existing records have NULL per-subq counts (migration safety)
# ---------------------------------------------------------------------------
def test_existing_records_have_null_per_subq_counts():
    """Pre-migration records should have NULL in the new per-subq columns.

    A ``query_history`` table is created without the new columns and one row
    is inserted. ``init_history_db`` is then run (it is expected to add the
    missing columns), and the pre-existing row must read back ``None`` for both
    new columns.
    """
    # TemporaryDirectory cleans up the scratch database; mkdtemp() would leak it.
    with tempfile.TemporaryDirectory() as db_dir:
        conn = sqlite3.connect(os.path.join(db_dir, "test_history_migrate.db"))
        try:
            # Old-style schema: no per-sub-question count columns.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS query_history (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    input_text TEXT NOT NULL,
                    chunks_retrieved_count INTEGER DEFAULT 0,
                    chunks_filtered_count INTEGER DEFAULT 0,
                    created_at TEXT NOT NULL DEFAULT (datetime('now'))
                )
            """)
            # Insert an "old" record WITHOUT the new columns.
            conn.execute(
                "INSERT INTO query_history (input_text) VALUES (?)",
                ("old question",),
            )
            conn.commit()

            # Run the migration (init_history_db should add missing columns).
            init_history_db(conn)

            # Read the row and its column names from the same cursor instead
            # of issuing a second SELECT just for the description.
            cursor = conn.execute(
                "SELECT * FROM query_history WHERE input_text = ?",
                ("old question",),
            )
            row = cursor.fetchone()
            col_names = [desc[0] for desc in cursor.description]
        finally:
            # Always release the connection, even when a step above fails.
            conn.close()

    # Fail with a clear message rather than a TypeError from zip() below.
    assert row is not None, "pre-migration record disappeared after migration"

    # After migration, the new columns should exist...
    assert "chunks_retrieved_per_subq_count" in col_names
    assert "chunks_filtered_per_subq_count" in col_names

    # ...and hold NULL for the record that predates them.
    row_dict = dict(zip(col_names, row))
    assert row_dict["chunks_retrieved_per_subq_count"] is None
    assert row_dict["chunks_filtered_per_subq_count"] is None

View File

@ -0,0 +1,66 @@
"""Tests for Phase 4.4 — Prompt template updates for per-sub-question pipeline.
Covers:
- generate template uses {context_sections} placeholder (not {context})
- Built-in fallback template uses {context_sections}
- reset_to_defaults() resets generate to the new template
"""
import os
import tempfile
import pytest
from app.core.sqlite_db import init_prompts_db, seed_default_profiles
from app.services.prompt_service import PromptService
@pytest.fixture
def prompt_service():
    """Yield a PromptService backed by a temporary DB with seeded profiles.

    The database is created in a temporary directory, initialised with
    ``init_prompts_db`` and populated with ``seed_default_profiles``. The
    directory is removed once the test using the fixture has finished.
    """
    # Local import: this test module does not import sqlite3 at top level.
    import sqlite3

    # TemporaryDirectory cleans up after the test; mkdtemp() would leak the
    # directory on every run.
    # NOTE(review): assumes PromptService does not keep the DB file open after
    # the test ends (matters for directory removal on Windows) — confirm.
    with tempfile.TemporaryDirectory() as db_dir:
        db_path = os.path.join(db_dir, "test_prompts.db")
        conn = sqlite3.connect(db_path)
        try:
            conn.row_factory = sqlite3.Row
            init_prompts_db(conn)
            seed_default_profiles(conn)
        finally:
            # Close the setup connection even if seeding fails.
            conn.close()
        yield PromptService(db_path=db_path)
def test_generate_template_uses_context_sections(prompt_service):
    """The 'generate' template must use ``{context_sections}``.

    The template returned by the prompt service must contain the new
    placeholder and must no longer contain the old ``{context}`` one.
    """
    generate_template = prompt_service.get_prompt_template("generate")

    has_new_placeholder = "{context_sections}" in generate_template
    has_old_placeholder = "{context}" in generate_template

    assert has_new_placeholder
    assert not has_old_placeholder
def test_builtin_generate_template_has_context_sections():
    """The built-in seed template must use ``{context_sections}``.

    Checks the module-level ``_SEED_GENERATE`` constant directly, without a
    prompt service: it must contain the new placeholder and not ``{context}``.
    """
    from app.core.sqlite_db import _SEED_GENERATE as seed_template

    has_new_placeholder = "{context_sections}" in seed_template
    has_old_placeholder = "{context}" in seed_template

    assert has_new_placeholder
    assert not has_old_placeholder
def test_reset_to_defaults_includes_new_generate_template(prompt_service):
    """Resetting a profile must bring back the ``{context_sections}`` template.

    The active profile's 'generate' prompt is first overwritten with a custom
    template that uses the old ``{context}`` placeholder. After
    ``reset_to_defaults()`` the template must use ``{context_sections}`` again.
    """
    active_profile = prompt_service.get_active_profile_name()

    # Step 1: override the template and confirm the override is visible.
    prompt_service.update_prompt(
        active_profile, "generate", "custom template with {context}"
    )
    overridden = prompt_service.get_prompt_template("generate")
    assert "{context}" in overridden
    assert "{context_sections}" not in overridden

    # Step 2: reset and confirm the default template is back.
    prompt_service.reset_to_defaults(active_profile)
    restored = prompt_service.get_prompt_template("generate")
    assert "{context_sections}" in restored
    assert "{context}" not in restored

View File

@ -0,0 +1,124 @@
"""Tests for per-sub-question response format validation — Phase 4.3.
Covers answer format invariants:
- Sub-question headers present in markdown
- Citation bracket labels in answer text
- grouped_sources match sub-question boundaries
- Single sub-question still uses header format
"""
import pytest
from unittest.mock import AsyncMock, MagicMock
from app.services.rag import RAGService
# ---------------------------------------------------------------------------
# Test: answer has sub-question headers
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_answer_has_subquestion_headers():
    """The returned answer must contain ``## Sub-question N:`` headers."""
    canned_markdown = (
        "## Sub-question 1: First question?\n"
        "- Point one [doc.pdf, page 1]\n\n"
        "## Sub-question 2: Second question?\n"
        "- Point two [doc.pdf, page 2]\n"
    )
    mock_llm = MagicMock()
    mock_llm.complete = AsyncMock(return_value=canned_markdown)
    rag = RAGService(llm_client=mock_llm)

    result = await rag.generate_response_per_subquestion(
        sub_questions=["First question?", "Second question?"],
        sub_chunks=[["chunk1"], ["chunk2"]],
        sub_metadata=[
            [{"filename": "doc.pdf", "page_number": 1}],
            [{"filename": "doc.pdf", "page_number": 2}],
        ],
    )
    answer = result[0]

    for header in ("## Sub-question 1:", "## Sub-question 2:"):
        assert header in answer
# ---------------------------------------------------------------------------
# Test: citations use bracket labels
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_answer_citations_use_bracket_labels():
    """The returned answer must keep ``[filename, page N]`` citations."""
    canned_markdown = (
        "## Sub-question 1: What is X?\n"
        "- X is defined as a variable [report.pdf, page 5]\n"
    )
    mock_llm = MagicMock()
    mock_llm.complete = AsyncMock(return_value=canned_markdown)
    rag = RAGService(llm_client=mock_llm)

    result = await rag.generate_response_per_subquestion(
        sub_questions=["What is X?"],
        sub_chunks=[["chunk about X"]],
        sub_metadata=[[{"filename": "report.pdf", "page_number": 5}]],
    )
    answer = result[0]

    assert "[report.pdf, page 5]" in answer
# ---------------------------------------------------------------------------
# Test: grouped_sources match sub-question boundaries
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_grouped_sources_match_subquestions():
    """Each sub-question's source group holds only its own chunks' metadata."""
    mock_llm = MagicMock()
    mock_llm.complete = AsyncMock(
        return_value="## Sub-question 1: Q1?\n- A1\n\n## Sub-question 2: Q2?\n- A2\n"
    )
    rag = RAGService(llm_client=mock_llm)

    metadata_q1 = [
        {"filename": "alpha.pdf", "page_number": 1},
        {"filename": "beta.pdf", "page_number": 2},
    ]
    metadata_q2 = [
        {"filename": "gamma.pdf", "page_number": 3},
    ]

    result = await rag.generate_response_per_subquestion(
        sub_questions=["Q1?", "Q2?"],
        sub_chunks=[["chunk_alpha", "chunk_beta"], ["chunk_gamma"]],
        sub_metadata=[metadata_q1, metadata_q2],
    )
    grouped_sources = result[2]

    assert len(grouped_sources) == 2

    # Sub-q 0 sources should only contain alpha and beta.
    names_in_first = {entry["filename"] for entry in grouped_sources[0]}
    assert names_in_first == {"alpha.pdf", "beta.pdf"}

    # Sub-q 1 sources should only contain gamma.
    names_in_second = {entry["filename"] for entry in grouped_sources[1]}
    assert names_in_second == {"gamma.pdf"}
# ---------------------------------------------------------------------------
# Test: single sub-question still uses header format
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_single_subquestion_format():
    """A lone sub-question still yields a ``## Sub-question 1:`` header.

    ``grouped_sources`` must contain exactly one group holding one source.
    """
    canned_markdown = (
        "## Sub-question 1: What is this?\n"
        "- It is a test [test.pdf, page 1]\n"
    )
    mock_llm = MagicMock()
    mock_llm.complete = AsyncMock(return_value=canned_markdown)
    rag = RAGService(llm_client=mock_llm)

    result = await rag.generate_response_per_subquestion(
        sub_questions=["What is this?"],
        sub_chunks=[["test chunk"]],
        sub_metadata=[[{"filename": "test.pdf", "page_number": 1}]],
    )
    answer, grouped_sources = result[0], result[2]

    assert "## Sub-question 1:" in answer
    assert len(grouped_sources) == 1
    only_group = grouped_sources[0]
    assert len(only_group) == 1