From dd98fa0b65d6109629334fece099aa34637e9fcd Mon Sep 17 00:00:00 2001 From: Woody Date: Sun, 26 Apr 2026 23:28:58 +0800 Subject: [PATCH] test(backend): add Phase 4 unit tests for generate, format, history, prompts 9 tests for generate_response_per_subquestion() and answer format validation covering multi-sub-q, empty, prompt construction, and markdown format. 8 tests for new history XML/JSON formats (sources as list-of-lists, <sub_q> wrappers in XML) and new {context_sections} prompt template. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- .../app/test/test_phase4_generate_per_subq.py | 149 +++++++++++++++ .../app/test/test_phase4_history_format.py | 175 ++++++++++++++++++ .../app/test/test_phase4_prompt_templates.py | 66 +++++++ .../app/test/test_phase4_response_format.py | 124 +++++++++++++ 4 files changed, 514 insertions(+) create mode 100644 backend/app/test/test_phase4_generate_per_subq.py create mode 100644 backend/app/test/test_phase4_history_format.py create mode 100644 backend/app/test/test_phase4_prompt_templates.py create mode 100644 backend/app/test/test_phase4_response_format.py diff --git a/backend/app/test/test_phase4_generate_per_subq.py b/backend/app/test/test_phase4_generate_per_subq.py new file mode 100644 index 0000000..1b843c7 --- /dev/null +++ b/backend/app/test/test_phase4_generate_per_subq.py @@ -0,0 +1,149 @@ +"""Tests for RAGService.generate_response_per_subquestion() — Phase 4.3. 
+ +Covers sub-question-organized response generation: +- Two sub-questions with mixed chunk counts +- Empty input handling +- All-empty chunks fallback +- Prompt contains context_sections placeholder +- LLM client not configured fallback +""" +import pytest +from unittest.mock import AsyncMock, MagicMock + +from app.services.rag import RAGService + + +# --------------------------------------------------------------------------- +# Test: two sub-questions, LLM returns markdown with headers +# --------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_generate_per_subq_two_questions(): + """Two sub-questions, 2 chunks for first, 1 for second. + + LLM returns markdown with ## Sub-question 1/2 headers. + Assert answer contains both headers and grouped_sources has correct shape. + """ + llm = MagicMock() + llm.complete = AsyncMock(return_value=( + "## Sub-question 1: What is A?\n" + "- Bullet point A1 [file_a.pdf, page 1]\n" + "- Bullet point A2 [file_a.pdf, page 2]\n\n" + "## Sub-question 2: What is B?\n" + "- Bullet point B1 [file_b.pdf, page 1]\n" + )) + + service = RAGService(llm_client=llm) + answer, prompt, grouped_sources = await service.generate_response_per_subquestion( + sub_questions=["What is A?", "What is B?"], + sub_chunks=[ + ["chunk A1 text", "chunk A2 text"], + ["chunk B1 text"], + ], + sub_metadata=[ + [ + {"filename": "file_a.pdf", "page_number": 1, "content_summary": "Summary A1"}, + {"filename": "file_a.pdf", "page_number": 2, "content_summary": "Summary A2"}, + ], + [ + {"filename": "file_b.pdf", "page_number": 1, "content_summary": "Summary B1"}, + ], + ], + ) + + assert "## Sub-question 1: What is A?" in answer + assert "## Sub-question 2: What is B?" 
in answer + assert len(grouped_sources) == 2 + assert len(grouped_sources[0]) == 2 # 2 sources for sub-q 0 + assert len(grouped_sources[1]) == 1 # 1 source for sub-q 1 + assert grouped_sources[0][0]["filename"] == "file_a.pdf" + assert grouped_sources[1][0]["filename"] == "file_b.pdf" + llm.complete.assert_called_once() + + +# --------------------------------------------------------------------------- +# Test: empty input +# --------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_generate_per_subq_empty_input(): + """Empty sub_questions returns fallback message and empty grouped_sources.""" + llm = MagicMock() + llm.complete = AsyncMock() + + service = RAGService(llm_client=llm) + answer, prompt, grouped_sources = await service.generate_response_per_subquestion( + sub_questions=[], + sub_chunks=[], + sub_metadata=[], + ) + + assert answer == "I could not find any relevant information to answer your question." + assert grouped_sources == [] + llm.complete.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test: sub-questions provided but all chunk lists empty +# --------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_generate_per_subq_no_chunks(): + """Sub-questions provided but all chunk lists empty → fallback message.""" + llm = MagicMock() + llm.complete = AsyncMock() + + service = RAGService(llm_client=llm) + answer, prompt, grouped_sources = await service.generate_response_per_subquestion( + sub_questions=["What is A?", "What is B?"], + sub_chunks=[[], []], + sub_metadata=[[], []], + ) + + assert answer == "I could not find any relevant information to answer your question." 
+ assert grouped_sources == [] + llm.complete.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test: prompt contains context_sections placeholder +# --------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_generate_per_subq_prompt_contains_context_sections(): + """Verify the prompt sent to LLM contains ### Context for Sub-question 0: + header and chunk content.""" + captured_prompt = None + + async def capture_complete(prompt, **kwargs): + nonlocal captured_prompt + captured_prompt = prompt + return "## Sub-question 1: What is A?\n- Answer" + + llm = MagicMock() + llm.complete = AsyncMock(side_effect=capture_complete) + + service = RAGService(llm_client=llm) + await service.generate_response_per_subquestion( + sub_questions=["What is A?"], + sub_chunks=[["chunk text here"]], + sub_metadata=[[{"filename": "file_a.pdf", "page_number": 1, "content_summary": "Sum"}]], + ) + + assert captured_prompt is not None + assert "### Context for Sub-question 0:" in captured_prompt + assert "chunk text here" in captured_prompt + assert "file_a.pdf" in captured_prompt + + +# --------------------------------------------------------------------------- +# Test: LLM client not configured +# --------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_generate_per_subq_llm_not_configured(): + """llm_client=None → returns 'LLM client not configured' message.""" + service = RAGService(llm_client=None) + answer, prompt, grouped_sources = await service.generate_response_per_subquestion( + sub_questions=["What is A?"], + sub_chunks=[["some chunk"]], + sub_metadata=[[{"filename": "a.pdf"}]], + ) + + assert answer == "LLM client not configured." 
+ assert grouped_sources == [] diff --git a/backend/app/test/test_phase4_history_format.py b/backend/app/test/test_phase4_history_format.py new file mode 100644 index 0000000..04c8643 --- /dev/null +++ b/backend/app/test/test_phase4_history_format.py @@ -0,0 +1,175 @@ +"""Tests for Phase 4.4 — New history recording formats. + +Covers: +- sources JSON grouped by sub-question (list-of-lists) +- Per-sub-question XML chunk formatting (retrieved & filtered) +- New per-sub-question count columns in query_history table +- Backward compatibility: old records have NULL in new columns +""" +import json +import os +import sqlite3 +import tempfile + +import pytest + +from app.core.sqlite_db import init_history_db +from app.routers.query import ( + format_chunks_retrieved_per_subq, + format_chunks_filtered_per_subq, +) + + +# --------------------------------------------------------------------------- +# Test: sources JSON is grouped by sub-question +# --------------------------------------------------------------------------- +def test_sources_json_is_grouped_by_subq(): + """The sources JSON stored in history must be a list-of-lists format. + + Example: json.dumps([[s1, s2], [s3]]) instead of flat [s1, s2, s3]. 
+ """ + sources_nested = [ + [ + {"filename": "a.pdf", "page_number": 1}, + {"filename": "a.pdf", "page_number": 2}, + ], + [ + {"filename": "b.pdf", "page_number": 1}, + ], + ] + sources_json = json.dumps(sources_nested) + + # Parse it back — must be a list of lists + parsed = json.loads(sources_json) + assert isinstance(parsed, list) + assert len(parsed) == 2 + assert isinstance(parsed[0], list) + assert len(parsed[0]) == 2 + assert isinstance(parsed[1], list) + assert len(parsed[1]) == 1 + assert parsed[0][0]["filename"] == "a.pdf" + assert parsed[1][0]["filename"] == "b.pdf" + + +# --------------------------------------------------------------------------- +# Test: chunks retrieved XML has sub_q wrappers +# --------------------------------------------------------------------------- +def test_chunks_retrieved_xml_has_subq_wrappers(): + """format_chunks_retrieved_per_subq must produce + <sub_q> wrappers around chunk elements.""" + results = [ + ("What is A?", [ + ("chunk A1 text", {"filename": "a.pdf", "page_number": 1}, 0.5), + ]), + ("What is B?", [ + ("chunk B1 text", {"filename": "b.pdf", "page_number": 3}, 0.8), + ("chunk B2 text", {"filename": "b.pdf", "page_number": 4}, 0.9), + ]), + ] + + xml = format_chunks_retrieved_per_subq(results) + + assert '' in xml  # NOTE(review): '' is a substring of every str, so this check and the two after it always pass; the expected sub_q tag literals appear to be missing, confirm and restore them + assert '' in xml + assert "" in xml + assert "chunk A1 text" in xml + assert "chunk B1 text" in xml + assert "chunk B2 text" in xml + assert "a.pdf" in xml + assert "b.pdf" in xml + + +# --------------------------------------------------------------------------- +# Test: chunks filtered XML has sub_q wrappers +# --------------------------------------------------------------------------- +def test_chunks_filtered_xml_has_subq_wrappers(): + """format_chunks_filtered_per_subq must produce + <sub_q> wrappers around chunk elements with relevance scores.""" + results = [ + ("What is X?", [ + ("filtered X1", {"filename": "x.pdf", "page_number": 1, "relevance_score": 8}), + ]), + ("What is Y?", [ + ("filtered Y1", 
{"filename": "y.pdf", "page_number": 2, "relevance_score": 9}), + ]), + ] + + xml = format_chunks_filtered_per_subq(results) + + assert '' in xml  # NOTE(review): '' is a substring of every str, so this check and the two after it always pass; the expected sub_q tag literals appear to be missing, confirm and restore them + assert '' in xml + assert "" in xml + assert "filtered X1" in xml + assert "filtered Y1" in xml + assert "Relevance: 8" in xml + assert "Relevance: 9" in xml + + +# --------------------------------------------------------------------------- +# Test: per-subq count columns exist in query_history table +# --------------------------------------------------------------------------- +def test_per_subq_count_columns_exist(): + """Verify that chunks_retrieved_per_subq_count and + chunks_filtered_per_subq_count columns exist in query_history.""" + db_dir = tempfile.mkdtemp() + db_path = os.path.join(db_dir, "test_history.db") + + conn = sqlite3.connect(db_path) + init_history_db(conn) + + # Use PRAGMA table_info to get column names + cursor = conn.execute("PRAGMA table_info(query_history)") + columns = {row[1] for row in cursor.fetchall()} + conn.close() + + assert "chunks_retrieved_per_subq_count" in columns, ( + f"chunks_retrieved_per_subq_count not found. Columns: {columns}" + ) + assert "chunks_filtered_per_subq_count" in columns, ( + f"chunks_filtered_per_subq_count not found. 
Columns: {columns}" + ) + + +# --------------------------------------------------------------------------- +# Test: existing records have NULL per-subq counts (migration safety) +# --------------------------------------------------------------------------- +def test_existing_records_have_null_per_subq_counts(): + """Pre-migration records should have NULL in the new per-subq columns.""" + db_dir = tempfile.mkdtemp() + db_path = os.path.join(db_dir, "test_history_migrate.db") + + conn = sqlite3.connect(db_path) + conn.execute(""" + CREATE TABLE IF NOT EXISTS query_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + input_text TEXT NOT NULL, + chunks_retrieved_count INTEGER DEFAULT 0, + chunks_filtered_count INTEGER DEFAULT 0, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + """) + # Insert an "old" record WITHOUT the new columns + conn.execute( + "INSERT INTO query_history (input_text) VALUES (?)", + ("old question",), + ) + conn.commit() + + # Now run the migration (init_history_db should add missing columns) + init_history_db(conn) + + # Verify the old record has NULL in the new columns + row = conn.execute("SELECT * FROM query_history WHERE input_text = 'old question'").fetchone() + + # Access by column name via description + col_names = [desc[0] for desc in conn.execute("SELECT * FROM query_history LIMIT 1").description] + + # After migration, new columns should exist + assert "chunks_retrieved_per_subq_count" in col_names + assert "chunks_filtered_per_subq_count" in col_names + + row_dict = dict(zip(col_names, row)) + assert row_dict["chunks_retrieved_per_subq_count"] is None + assert row_dict["chunks_filtered_per_subq_count"] is None + + conn.close() diff --git a/backend/app/test/test_phase4_prompt_templates.py b/backend/app/test/test_phase4_prompt_templates.py new file mode 100644 index 0000000..82356d9 --- /dev/null +++ b/backend/app/test/test_phase4_prompt_templates.py @@ -0,0 +1,66 @@ +"""Tests for Phase 4.4 — Prompt template updates for 
per-sub-question pipeline. + +Covers: +- generate template uses {context_sections} placeholder (not {context}) +- Built-in fallback template uses {context_sections} +- reset_to_defaults() resets generate to the new template +""" +import os +import tempfile + +import pytest + +from app.core.sqlite_db import init_prompts_db, seed_default_profiles +from app.services.prompt_service import PromptService + + +@pytest.fixture +def prompt_service(): + """Create a PromptService backed by a temp DB with seeded profiles.""" + db_dir = tempfile.mkdtemp() + db_path = os.path.join(db_dir, "test_prompts.db") + + import sqlite3 + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + init_prompts_db(conn) + seed_default_profiles(conn) + conn.close() + + return PromptService(db_path=db_path) + + +def test_generate_template_uses_context_sections(prompt_service): + """When PromptService returns the generate template, {context_sections} + placeholder must be present and old {context} removed.""" + template = prompt_service.get_prompt_template("generate") + + assert "{context_sections}" in template + assert "{context}" not in template + + +def test_builtin_generate_template_has_context_sections(): + """When no prompt_service, the built-in seed template uses + {context_sections} instead of {context}.""" + from app.core.sqlite_db import _SEED_GENERATE + + assert "{context_sections}" in _SEED_GENERATE + assert "{context}" not in _SEED_GENERATE + + +def test_reset_to_defaults_includes_new_generate_template(prompt_service): + """After calling reset_to_defaults() on a profile, the generate step + uses {context_sections} placeholder.""" + profile_name = prompt_service.get_active_profile_name() + + prompt_service.update_prompt(profile_name, "generate", "custom template with {context}") + + modified = prompt_service.get_prompt_template("generate") + assert "{context}" in modified + assert "{context_sections}" not in modified + + prompt_service.reset_to_defaults(profile_name) + + 
template = prompt_service.get_prompt_template("generate") + assert "{context_sections}" in template + assert "{context}" not in template diff --git a/backend/app/test/test_phase4_response_format.py b/backend/app/test/test_phase4_response_format.py new file mode 100644 index 0000000..88cf7f1 --- /dev/null +++ b/backend/app/test/test_phase4_response_format.py @@ -0,0 +1,124 @@ +"""Tests for per-sub-question response format validation — Phase 4.3. + +Covers answer format invariants: +- Sub-question headers present in markdown +- Citation bracket labels in answer text +- grouped_sources match sub-question boundaries +- Single sub-question still uses header format +""" +import pytest +from unittest.mock import AsyncMock, MagicMock + +from app.services.rag import RAGService + + +# --------------------------------------------------------------------------- +# Test: answer has sub-question headers +# --------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_answer_has_subquestion_headers(): + """Answer string contains ## Sub-question N: headers.""" + llm = MagicMock() + llm.complete = AsyncMock(return_value=( + "## Sub-question 1: First question?\n" + "- Point one [doc.pdf, page 1]\n\n" + "## Sub-question 2: Second question?\n" + "- Point two [doc.pdf, page 2]\n" + )) + + service = RAGService(llm_client=llm) + answer, _prompt, _sources = await service.generate_response_per_subquestion( + sub_questions=["First question?", "Second question?"], + sub_chunks=[["chunk1"], ["chunk2"]], + sub_metadata=[ + [{"filename": "doc.pdf", "page_number": 1}], + [{"filename": "doc.pdf", "page_number": 2}], + ], + ) + + assert "## Sub-question 1:" in answer + assert "## Sub-question 2:" in answer + + +# --------------------------------------------------------------------------- +# Test: citations use bracket labels +# --------------------------------------------------------------------------- +@pytest.mark.asyncio +async def 
test_answer_citations_use_bracket_labels(): + """Answer contains [filename, page N] citation format.""" + llm = MagicMock() + llm.complete = AsyncMock(return_value=( + "## Sub-question 1: What is X?\n" + "- X is defined as a variable [report.pdf, page 5]\n" + )) + + service = RAGService(llm_client=llm) + answer, _prompt, _sources = await service.generate_response_per_subquestion( + sub_questions=["What is X?"], + sub_chunks=[["chunk about X"]], + sub_metadata=[[{"filename": "report.pdf", "page_number": 5}]], + ) + + assert "[report.pdf, page 5]" in answer + + +# --------------------------------------------------------------------------- +# Test: grouped_sources match sub-question boundaries +# --------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_grouped_sources_match_subquestions(): + """Each sub-question's source list only contains metadata from its own chunks.""" + llm = MagicMock() + llm.complete = AsyncMock(return_value=( + "## Sub-question 1: Q1?\n- A1\n\n## Sub-question 2: Q2?\n- A2\n" + )) + + service = RAGService(llm_client=llm) + _answer, _prompt, grouped_sources = await service.generate_response_per_subquestion( + sub_questions=["Q1?", "Q2?"], + sub_chunks=[ + ["chunk_alpha", "chunk_beta"], + ["chunk_gamma"], + ], + sub_metadata=[ + [ + {"filename": "alpha.pdf", "page_number": 1}, + {"filename": "beta.pdf", "page_number": 2}, + ], + [ + {"filename": "gamma.pdf", "page_number": 3}, + ], + ], + ) + + assert len(grouped_sources) == 2 + # Sub-q 0 sources should only contain alpha and beta + filenames_0 = {m["filename"] for m in grouped_sources[0]} + assert filenames_0 == {"alpha.pdf", "beta.pdf"} + # Sub-q 1 sources should only contain gamma + filenames_1 = {m["filename"] for m in grouped_sources[1]} + assert filenames_1 == {"gamma.pdf"} + + +# --------------------------------------------------------------------------- +# Test: single sub-question still uses header format +# 
--------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_single_subquestion_format(): + """When only one sub-question, answer still uses ## Sub-question 1: header.""" + llm = MagicMock() + llm.complete = AsyncMock(return_value=( + "## Sub-question 1: What is this?\n" + "- It is a test [test.pdf, page 1]\n" + )) + + service = RAGService(llm_client=llm) + answer, _prompt, grouped_sources = await service.generate_response_per_subquestion( + sub_questions=["What is this?"], + sub_chunks=[["test chunk"]], + sub_metadata=[[{"filename": "test.pdf", "page_number": 1}]], + ) + + assert "## Sub-question 1:" in answer + assert len(grouped_sources) == 1 + assert len(grouped_sources[0]) == 1