test(backend): add Phase 4 unit tests for generate, format, history, prompts
9 tests for generate_response_per_subquestion() and answer format validation covering multi-sub-q, empty, prompt construction, and markdown format. 8 tests for new history XML/JSON formats (sources as list-of-lists, <sub_q> wrappers in XML) and new {context_sections} prompt template.
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
ab6ec28de6
commit
dd98fa0b65
|
|
@ -0,0 +1,149 @@
|
|||
"""Tests for RAGService.generate_response_per_subquestion() — Phase 4.3.
|
||||
|
||||
Covers sub-question-organized response generation:
|
||||
- Two sub-questions with mixed chunk counts
|
||||
- Empty input handling
|
||||
- All-empty chunks fallback
|
||||
- Prompt contains context_sections placeholder
|
||||
- LLM client not configured fallback
|
||||
"""
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
from app.services.rag import RAGService
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: two sub-questions, LLM returns markdown with headers
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_generate_per_subq_two_questions():
    """Happy path: two sub-questions with two chunks and one chunk respectively.

    The mocked LLM replies with markdown that carries one
    ``## Sub-question N`` header per question. The returned answer must
    contain both headers, grouped_sources must hold one source list per
    sub-question with the expected sizes and filenames, and the LLM must
    be invoked exactly once.
    """
    canned_markdown = (
        "## Sub-question 1: What is A?\n"
        "- Bullet point A1 [file_a.pdf, page 1]\n"
        "- Bullet point A2 [file_a.pdf, page 2]\n\n"
        "## Sub-question 2: What is B?\n"
        "- Bullet point B1 [file_b.pdf, page 1]\n"
    )
    llm = MagicMock()
    llm.complete = AsyncMock(return_value=canned_markdown)

    metadata_a = [
        {"filename": "file_a.pdf", "page_number": 1, "content_summary": "Summary A1"},
        {"filename": "file_a.pdf", "page_number": 2, "content_summary": "Summary A2"},
    ]
    metadata_b = [
        {"filename": "file_b.pdf", "page_number": 1, "content_summary": "Summary B1"},
    ]

    service = RAGService(llm_client=llm)
    answer, _prompt, grouped_sources = await service.generate_response_per_subquestion(
        sub_questions=["What is A?", "What is B?"],
        sub_chunks=[["chunk A1 text", "chunk A2 text"], ["chunk B1 text"]],
        sub_metadata=[metadata_a, metadata_b],
    )

    # Both per-question headers must survive into the returned answer.
    for header in ("## Sub-question 1: What is A?", "## Sub-question 2: What is B?"):
        assert header in answer

    # One source group per sub-question: 2 sources for sub-q 0, 1 for sub-q 1.
    assert len(grouped_sources) == 2
    sources_a, sources_b = grouped_sources
    assert len(sources_a) == 2
    assert len(sources_b) == 1
    assert sources_a[0]["filename"] == "file_a.pdf"
    assert sources_b[0]["filename"] == "file_b.pdf"
    llm.complete.assert_called_once()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: empty input
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_generate_per_subq_empty_input():
    """No sub-questions at all: expect the fallback message, no sources, and no LLM call."""
    llm = MagicMock()
    llm.complete = AsyncMock()
    service = RAGService(llm_client=llm)

    result = await service.generate_response_per_subquestion(
        sub_questions=[],
        sub_chunks=[],
        sub_metadata=[],
    )
    answer, _prompt, grouped_sources = result

    assert answer == "I could not find any relevant information to answer your question."
    assert grouped_sources == []
    # The LLM must not be consulted when there is nothing to answer.
    llm.complete.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: sub-questions provided but all chunk lists empty
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_generate_per_subq_no_chunks():
    """Sub-questions exist but every chunk list is empty: expect the fallback message."""
    llm = MagicMock()
    llm.complete = AsyncMock()
    service = RAGService(llm_client=llm)

    questions = ["What is A?", "What is B?"]
    result = await service.generate_response_per_subquestion(
        sub_questions=questions,
        sub_chunks=[[], []],
        sub_metadata=[[], []],
    )
    answer, _prompt, grouped_sources = result

    assert answer == "I could not find any relevant information to answer your question."
    assert grouped_sources == []
    # With no context available the LLM must not be called.
    llm.complete.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: prompt contains context_sections placeholder
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_generate_per_subq_prompt_contains_context_sections():
    """The prompt handed to the LLM carries the per-sub-question context section.

    A recording side effect stands in for ``llm.complete``; the recorded
    prompt must include the ``### Context for Sub-question 0:`` header,
    the chunk text and the source filename.
    """
    recorded_prompts = []

    async def record_prompt(prompt, **kwargs):
        # Keep every prompt seen; the assertions below inspect the latest one.
        recorded_prompts.append(prompt)
        return "## Sub-question 1: What is A?\n- Answer"

    llm = MagicMock()
    llm.complete = AsyncMock(side_effect=record_prompt)
    service = RAGService(llm_client=llm)

    chunk_metadata = {"filename": "file_a.pdf", "page_number": 1, "content_summary": "Sum"}
    await service.generate_response_per_subquestion(
        sub_questions=["What is A?"],
        sub_chunks=[["chunk text here"]],
        sub_metadata=[[chunk_metadata]],
    )

    captured_prompt = recorded_prompts[-1] if recorded_prompts else None
    assert captured_prompt is not None
    for expected in ("### Context for Sub-question 0:", "chunk text here", "file_a.pdf"):
        assert expected in captured_prompt
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: LLM client not configured
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_generate_per_subq_llm_not_configured():
    """Without an LLM client the service answers 'LLM client not configured.' and returns no sources."""
    service = RAGService(llm_client=None)

    result = await service.generate_response_per_subquestion(
        sub_questions=["What is A?"],
        sub_chunks=[["some chunk"]],
        sub_metadata=[[{"filename": "a.pdf"}]],
    )
    answer, _prompt, grouped_sources = result

    assert answer == "LLM client not configured."
    assert grouped_sources == []
|
||||
|
|
@ -0,0 +1,175 @@
|
|||
"""Tests for Phase 4.4 — New history recording formats.
|
||||
|
||||
Covers:
|
||||
- sources JSON grouped by sub-question (list-of-lists)
|
||||
- Per-sub-question XML chunk formatting (retrieved & filtered)
|
||||
- New per-sub-question count columns in query_history table
|
||||
- Backward compatibility: old records have NULL in new columns
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from app.core.sqlite_db import init_history_db
|
||||
from app.routers.query import (
|
||||
format_chunks_retrieved_per_subq,
|
||||
format_chunks_filtered_per_subq,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: sources JSON is grouped by sub-question
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_sources_json_is_grouped_by_subq():
    """Sources stored in history are serialized as a list of lists.

    One inner list per sub-question, e.g. ``json.dumps([[s1, s2], [s3]])``
    rather than the flat ``[s1, s2, s3]``. The structure must survive a
    JSON round trip unchanged.
    """
    first_group = [
        {"filename": "a.pdf", "page_number": 1},
        {"filename": "a.pdf", "page_number": 2},
    ]
    second_group = [
        {"filename": "b.pdf", "page_number": 1},
    ]

    # Serialize and parse back; the result must still be a list of lists.
    parsed = json.loads(json.dumps([first_group, second_group]))

    assert isinstance(parsed, list)
    assert len(parsed) == 2
    parsed_first, parsed_second = parsed
    assert isinstance(parsed_first, list)
    assert len(parsed_first) == 2
    assert isinstance(parsed_second, list)
    assert len(parsed_second) == 1
    assert parsed_first[0]["filename"] == "a.pdf"
    assert parsed_second[0]["filename"] == "b.pdf"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: chunks retrieved XML has sub_q wrappers
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_chunks_retrieved_xml_has_subq_wrappers():
    """format_chunks_retrieved_per_subq wraps each sub-question's chunks in <sub_q>.

    Every opening tag must carry the sub-question's ``idx`` and ``question``
    attributes, a closing ``</sub_q>`` must be present, and each chunk text
    and filename must appear in the output.
    """
    chunks_for_a = [
        ("chunk A1 text", {"filename": "a.pdf", "page_number": 1}, 0.5),
    ]
    chunks_for_b = [
        ("chunk B1 text", {"filename": "b.pdf", "page_number": 3}, 0.8),
        ("chunk B2 text", {"filename": "b.pdf", "page_number": 4}, 0.9),
    ]

    xml = format_chunks_retrieved_per_subq(
        [("What is A?", chunks_for_a), ("What is B?", chunks_for_b)]
    )

    expected_fragments = (
        '<sub_q idx="0" question="What is A?">',
        '<sub_q idx="1" question="What is B?">',
        "</sub_q>",
        "chunk A1 text",
        "chunk B1 text",
        "chunk B2 text",
        "a.pdf",
        "b.pdf",
    )
    for fragment in expected_fragments:
        assert fragment in xml
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: chunks filtered XML has sub_q wrappers
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_chunks_filtered_xml_has_subq_wrappers():
    """format_chunks_filtered_per_subq wraps each sub-question's chunks in <sub_q>.

    Besides the wrappers and chunk texts, the output must expose each
    chunk's relevance score as ``Relevance: N``.
    """
    chunks_for_x = [
        ("filtered X1", {"filename": "x.pdf", "page_number": 1, "relevance_score": 8}),
    ]
    chunks_for_y = [
        ("filtered Y1", {"filename": "y.pdf", "page_number": 2, "relevance_score": 9}),
    ]

    xml = format_chunks_filtered_per_subq(
        [("What is X?", chunks_for_x), ("What is Y?", chunks_for_y)]
    )

    expected_fragments = (
        '<sub_q idx="0" question="What is X?">',
        '<sub_q idx="1" question="What is Y?">',
        "</sub_q>",
        "filtered X1",
        "filtered Y1",
        "Relevance: 8",
        "Relevance: 9",
    )
    for fragment in expected_fragments:
        assert fragment in xml
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: per-subq count columns exist in query_history table
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_per_subq_count_columns_exist():
    """A freshly initialized query_history table has both per-sub-question count columns.

    Checks for ``chunks_retrieved_per_subq_count`` and
    ``chunks_filtered_per_subq_count`` after ``init_history_db`` has run on
    an empty database.
    """
    # TemporaryDirectory removes the scratch database afterwards; the
    # try/finally closes the connection even if initialization raises.
    with tempfile.TemporaryDirectory() as db_dir:
        conn = sqlite3.connect(os.path.join(db_dir, "test_history.db"))
        try:
            init_history_db(conn)

            # PRAGMA table_info yields one row per column; index 1 is the name.
            cursor = conn.execute("PRAGMA table_info(query_history)")
            columns = {row[1] for row in cursor.fetchall()}
        finally:
            conn.close()

    assert "chunks_retrieved_per_subq_count" in columns, (
        f"chunks_retrieved_per_subq_count not found. Columns: {columns}"
    )
    assert "chunks_filtered_per_subq_count" in columns, (
        f"chunks_filtered_per_subq_count not found. Columns: {columns}"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: existing records have NULL per-subq counts (migration safety)
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_existing_records_have_null_per_subq_counts():
    """Rows written before the migration read back NULL in the new per-subq columns.

    Builds a query_history table without the new columns, inserts one row,
    runs ``init_history_db`` on the same connection, then checks that the
    new columns exist and are NULL for that row.
    """
    # TemporaryDirectory removes the scratch database afterwards; the
    # try/finally closes the connection even when an assertion fails.
    with tempfile.TemporaryDirectory() as db_dir:
        conn = sqlite3.connect(os.path.join(db_dir, "test_history_migrate.db"))
        try:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS query_history (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    input_text TEXT NOT NULL,
                    chunks_retrieved_count INTEGER DEFAULT 0,
                    chunks_filtered_count INTEGER DEFAULT 0,
                    created_at TEXT NOT NULL DEFAULT (datetime('now'))
                )
            """)
            # Insert an "old" record WITHOUT the new columns
            conn.execute(
                "INSERT INTO query_history (input_text) VALUES (?)",
                ("old question",),
            )
            conn.commit()

            # Now run the migration (init_history_db should add missing columns)
            init_history_db(conn)

            # One query supplies both the column names (cursor.description)
            # and the row itself.
            cursor = conn.execute(
                "SELECT * FROM query_history WHERE input_text = ?",
                ("old question",),
            )
            col_names = [desc[0] for desc in cursor.description]
            row = cursor.fetchone()

            # After migration, new columns should exist
            assert "chunks_retrieved_per_subq_count" in col_names
            assert "chunks_filtered_per_subq_count" in col_names

            # Fail clearly if the row is gone, not with a TypeError in zip().
            assert row is not None, "pre-migration row is missing after init_history_db"

            row_dict = dict(zip(col_names, row))
            assert row_dict["chunks_retrieved_per_subq_count"] is None
            assert row_dict["chunks_filtered_per_subq_count"] is None
        finally:
            conn.close()
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
"""Tests for Phase 4.4 — Prompt template updates for per-sub-question pipeline.
|
||||
|
||||
Covers:
|
||||
- generate template uses {context_sections} placeholder (not {context})
|
||||
- Built-in fallback template uses {context_sections}
|
||||
- reset_to_defaults() resets generate to the new template
|
||||
"""
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from app.core.sqlite_db import init_prompts_db, seed_default_profiles
|
||||
from app.services.prompt_service import PromptService
|
||||
|
||||
|
||||
@pytest.fixture
def prompt_service(tmp_path):
    """Provide a PromptService backed by a freshly seeded per-test SQLite file.

    The database lives under pytest's ``tmp_path``, so pytest manages its
    cleanup and no ``mkdtemp()`` directory is left behind.

    Returns:
        A ``PromptService`` constructed with the path of the seeded database.
    """
    import sqlite3

    # PromptService receives a plain string path, as before.
    db_path = str(tmp_path / "test_prompts.db")

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        init_prompts_db(conn)
        seed_default_profiles(conn)
    finally:
        # Close the setup connection even if schema creation or seeding fails.
        conn.close()

    return PromptService(db_path=db_path)
|
||||
|
||||
|
||||
def test_generate_template_uses_context_sections(prompt_service):
    """The generate template from PromptService uses {context_sections}, not the old {context}."""
    generate_template = prompt_service.get_prompt_template("generate")

    has_new_placeholder = "{context_sections}" in generate_template
    has_old_placeholder = "{context}" in generate_template

    assert has_new_placeholder
    assert not has_old_placeholder
|
||||
|
||||
|
||||
def test_builtin_generate_template_has_context_sections():
    """The built-in seed generate template uses {context_sections}, not {context}."""
    from app.core.sqlite_db import _SEED_GENERATE as seed_template

    has_new_placeholder = "{context_sections}" in seed_template
    has_old_placeholder = "{context}" in seed_template

    assert has_new_placeholder
    assert not has_old_placeholder
|
||||
|
||||
|
||||
def test_reset_to_defaults_includes_new_generate_template(prompt_service):
    """reset_to_defaults() restores a generate template that uses {context_sections}.

    The active profile's generate prompt is first overwritten with a
    template using the old {context} placeholder; after the reset the new
    placeholder must be back and the old one gone.
    """
    active_profile = prompt_service.get_active_profile_name()

    # Overwrite the generate step and confirm the override is what gets served.
    prompt_service.update_prompt(active_profile, "generate", "custom template with {context}")
    customized = prompt_service.get_prompt_template("generate")
    assert "{context}" in customized
    assert "{context_sections}" not in customized

    prompt_service.reset_to_defaults(active_profile)

    restored = prompt_service.get_prompt_template("generate")
    assert "{context_sections}" in restored
    assert "{context}" not in restored
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
"""Tests for per-sub-question response format validation — Phase 4.3.
|
||||
|
||||
Covers answer format invariants:
|
||||
- Sub-question headers present in markdown
|
||||
- Citation bracket labels in answer text
|
||||
- grouped_sources match sub-question boundaries
|
||||
- Single sub-question still uses header format
|
||||
"""
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
from app.services.rag import RAGService
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: answer has sub-question headers
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_answer_has_subquestion_headers():
    """The returned answer contains a ``## Sub-question N:`` header for each sub-question."""
    canned_markdown = (
        "## Sub-question 1: First question?\n"
        "- Point one [doc.pdf, page 1]\n\n"
        "## Sub-question 2: Second question?\n"
        "- Point two [doc.pdf, page 2]\n"
    )
    llm = MagicMock()
    llm.complete = AsyncMock(return_value=canned_markdown)
    service = RAGService(llm_client=llm)

    result = await service.generate_response_per_subquestion(
        sub_questions=["First question?", "Second question?"],
        sub_chunks=[["chunk1"], ["chunk2"]],
        sub_metadata=[
            [{"filename": "doc.pdf", "page_number": 1}],
            [{"filename": "doc.pdf", "page_number": 2}],
        ],
    )
    answer = result[0]

    for header in ("## Sub-question 1:", "## Sub-question 2:"):
        assert header in answer
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: citations use bracket labels
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_answer_citations_use_bracket_labels():
    """The returned answer keeps the ``[filename, page N]`` citation label."""
    canned_markdown = (
        "## Sub-question 1: What is X?\n"
        "- X is defined as a variable [report.pdf, page 5]\n"
    )
    llm = MagicMock()
    llm.complete = AsyncMock(return_value=canned_markdown)
    service = RAGService(llm_client=llm)

    result = await service.generate_response_per_subquestion(
        sub_questions=["What is X?"],
        sub_chunks=[["chunk about X"]],
        sub_metadata=[[{"filename": "report.pdf", "page_number": 5}]],
    )
    answer = result[0]

    assert "[report.pdf, page 5]" in answer
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: grouped_sources match sub-question boundaries
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_grouped_sources_match_subquestions():
    """Each sub-question's source group holds only the metadata of its own chunks."""
    llm = MagicMock()
    llm.complete = AsyncMock(
        return_value="## Sub-question 1: Q1?\n- A1\n\n## Sub-question 2: Q2?\n- A2\n"
    )
    service = RAGService(llm_client=llm)

    metadata_q1 = [
        {"filename": "alpha.pdf", "page_number": 1},
        {"filename": "beta.pdf", "page_number": 2},
    ]
    metadata_q2 = [
        {"filename": "gamma.pdf", "page_number": 3},
    ]

    result = await service.generate_response_per_subquestion(
        sub_questions=["Q1?", "Q2?"],
        sub_chunks=[["chunk_alpha", "chunk_beta"], ["chunk_gamma"]],
        sub_metadata=[metadata_q1, metadata_q2],
    )
    grouped_sources = result[2]

    assert len(grouped_sources) == 2
    sources_q1, sources_q2 = grouped_sources
    # Sub-q 0 sources should only contain alpha and beta
    assert {source["filename"] for source in sources_q1} == {"alpha.pdf", "beta.pdf"}
    # Sub-q 1 sources should only contain gamma
    assert {source["filename"] for source in sources_q2} == {"gamma.pdf"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: single sub-question still uses header format
|
||||
# ---------------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
async def test_single_subquestion_format():
    """A lone sub-question still yields the ``## Sub-question 1:`` header and one source group."""
    canned_markdown = (
        "## Sub-question 1: What is this?\n"
        "- It is a test [test.pdf, page 1]\n"
    )
    llm = MagicMock()
    llm.complete = AsyncMock(return_value=canned_markdown)
    service = RAGService(llm_client=llm)

    result = await service.generate_response_per_subquestion(
        sub_questions=["What is this?"],
        sub_chunks=[["test chunk"]],
        sub_metadata=[[{"filename": "test.pdf", "page_number": 1}]],
    )
    answer, _prompt, grouped_sources = result

    assert "## Sub-question 1:" in answer
    # Exactly one group, holding exactly one source.
    assert len(grouped_sources) == 1
    assert len(grouped_sources[0]) == 1
|
||||
Loading…
Reference in New Issue