From dd98fa0b65d6109629334fece099aa34637e9fcd Mon Sep 17 00:00:00 2001
From: Woody <woody.ck.tse@gmail.com>
Date: Sun, 26 Apr 2026 23:28:58 +0800
Subject: [PATCH] test(backend): add Phase 4 unit tests for generate, format,
 history, prompts

9 tests for generate_response_per_subquestion() and answer format validation covering multi-sub-q, empty, prompt construction, and markdown format. 8 tests for new history XML/JSON formats (sources as list-of-lists, <sub_q> wrappers in XML) and new {context_sections} prompt template.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 .../app/test/test_phase4_generate_per_subq.py | 149 +++++++++++++++
 .../app/test/test_phase4_history_format.py    | 175 ++++++++++++++++++
 .../app/test/test_phase4_prompt_templates.py  |  66 +++++++
 .../app/test/test_phase4_response_format.py   | 124 +++++++++++++
 4 files changed, 514 insertions(+)
 create mode 100644 backend/app/test/test_phase4_generate_per_subq.py
 create mode 100644 backend/app/test/test_phase4_history_format.py
 create mode 100644 backend/app/test/test_phase4_prompt_templates.py
 create mode 100644 backend/app/test/test_phase4_response_format.py

diff --git a/backend/app/test/test_phase4_generate_per_subq.py b/backend/app/test/test_phase4_generate_per_subq.py
new file mode 100644
index 0000000..1b843c7
--- /dev/null
+++ b/backend/app/test/test_phase4_generate_per_subq.py
@@ -0,0 +1,149 @@
+"""Tests for RAGService.generate_response_per_subquestion() — Phase 4.3.
+
+Covers sub-question-organized response generation:
+- Two sub-questions with mixed chunk counts
+- Empty input handling
+- All-empty chunks fallback
+- Prompt sent to the LLM contains the rendered per-sub-question context sections
+- LLM client not configured fallback
+"""
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+
+from app.services.rag import RAGService
+
+
+# ---------------------------------------------------------------------------
+# Test: two sub-questions, LLM returns markdown with headers
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_generate_per_subq_two_questions():
+    """Generate an answer for two sub-questions with 2 and 1 chunks.
+
+    The mocked LLM replies with "## Sub-question 1/2" markdown sections; the
+    answer must carry both headers and grouped_sources must be shaped 2 + 1.
+    """
+    canned_reply = (
+        "## Sub-question 1: What is A?\n"
+        "- Bullet point A1 [file_a.pdf, page 1]\n"
+        "- Bullet point A2 [file_a.pdf, page 2]\n\n"
+        "## Sub-question 2: What is B?\n"
+        "- Bullet point B1 [file_b.pdf, page 1]\n"
+    )
+    llm = MagicMock()
+    llm.complete = AsyncMock(return_value=canned_reply)
+    meta_a = [
+        {"filename": "file_a.pdf", "page_number": 1, "content_summary": "Summary A1"},
+        {"filename": "file_a.pdf", "page_number": 2, "content_summary": "Summary A2"},
+    ]
+    meta_b = [
+        {"filename": "file_b.pdf", "page_number": 1, "content_summary": "Summary B1"},
+    ]
+
+    rag = RAGService(llm_client=llm)
+    answer, _prompt, grouped_sources = await rag.generate_response_per_subquestion(
+        sub_questions=["What is A?", "What is B?"],
+        sub_chunks=[
+            ["chunk A1 text", "chunk A2 text"],
+            ["chunk B1 text"],
+        ],
+        sub_metadata=[meta_a, meta_b],
+    )
+
+    # Both headers from the mocked reply must appear in the answer.
+    for header in ("## Sub-question 1: What is A?", "## Sub-question 2: What is B?"):
+        assert header in answer
+    # One source list per sub-question: two entries, then one.
+    assert [len(group) for group in grouped_sources] == [2, 1]
+    assert grouped_sources[0][0]["filename"] == "file_a.pdf"
+    assert grouped_sources[1][0]["filename"] == "file_b.pdf"
+    llm.complete.assert_called_once()
+
+
+# ---------------------------------------------------------------------------
+# Test: empty input
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_generate_per_subq_empty_input():
+    """No sub-questions at all: expect the fallback text and no LLM call."""
+    fallback = "I could not find any relevant information to answer your question."
+    llm = MagicMock()
+    llm.complete = AsyncMock()
+
+    rag = RAGService(llm_client=llm)
+    answer, _prompt, grouped_sources = await rag.generate_response_per_subquestion(
+        sub_questions=[], sub_chunks=[], sub_metadata=[],
+    )
+
+    assert answer == fallback
+    assert grouped_sources == []
+    # The empty case must not reach the LLM.
+    llm.complete.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# Test: sub-questions provided but all chunk lists empty
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_generate_per_subq_no_chunks():
+    """Sub-questions given but every chunk list empty: fallback, no LLM call."""
+    fallback = "I could not find any relevant information to answer your question."
+    llm = MagicMock()
+    llm.complete = AsyncMock()
+    rag = RAGService(llm_client=llm)
+    answer, _prompt, grouped_sources = await rag.generate_response_per_subquestion(
+        sub_questions=["What is A?", "What is B?"],
+        sub_chunks=[[], []],
+        sub_metadata=[[], []],
+    )
+
+    assert answer == fallback
+    assert grouped_sources == []
+    llm.complete.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# Test: prompt sent to the LLM contains rendered context sections
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_generate_per_subq_prompt_contains_context_sections():
+    """The prompt handed to the LLM must include the
+    "### Context for Sub-question 0:" header, the chunk text and filename."""
+    seen_prompts = []
+
+    async def record_prompt(prompt, **kwargs):
+        seen_prompts.append(prompt)
+        return "## Sub-question 1: What is A?\n- Answer"
+
+    llm = MagicMock()
+    llm.complete = AsyncMock(side_effect=record_prompt)
+    metadata = {"filename": "file_a.pdf", "page_number": 1, "content_summary": "Sum"}
+
+    rag = RAGService(llm_client=llm)
+    await rag.generate_response_per_subquestion(
+        sub_questions=["What is A?"],
+        sub_chunks=[["chunk text here"]],
+        sub_metadata=[[metadata]],
+    )
+
+    assert seen_prompts, "llm.complete was never awaited"
+    sent = seen_prompts[-1]
+    for fragment in ("### Context for Sub-question 0:", "chunk text here", "file_a.pdf"):
+        assert fragment in sent
+
+
+# ---------------------------------------------------------------------------
+# Test: LLM client not configured
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_generate_per_subq_llm_not_configured():
+    """Without an LLM client the service answers 'LLM client not configured.'"""
+    rag_without_llm = RAGService(llm_client=None)
+    answer, _prompt, sources = await rag_without_llm.generate_response_per_subquestion(
+        sub_questions=["What is A?"],
+        sub_chunks=[["some chunk"]],
+        sub_metadata=[[{"filename": "a.pdf"}]],
+    )
+
+    assert sources == []
+    assert answer == "LLM client not configured."
diff --git a/backend/app/test/test_phase4_history_format.py b/backend/app/test/test_phase4_history_format.py
new file mode 100644
index 0000000..04c8643
--- /dev/null
+++ b/backend/app/test/test_phase4_history_format.py
@@ -0,0 +1,175 @@
+"""Tests for Phase 4.4 — New history recording formats.
+
+Covers:
+- sources JSON grouped by sub-question (list-of-lists)
+- Per-sub-question XML chunk formatting (retrieved & filtered)
+- New per-sub-question count columns in query_history table
+- Backward compatibility: old records have NULL in new columns
+"""
+import json
+import os
+import sqlite3
+import tempfile
+
+import pytest
+
+from app.core.sqlite_db import init_history_db
+from app.routers.query import (
+    format_chunks_retrieved_per_subq,
+    format_chunks_filtered_per_subq,
+)
+
+
+# ---------------------------------------------------------------------------
+# Test: sources JSON is grouped by sub-question
+# ---------------------------------------------------------------------------
+def test_sources_json_is_grouped_by_subq():
+    """History stores sources as JSON list-of-lists, one list per sub-question.
+
+    I.e. json.dumps([[s1, s2], [s3]]) rather than a flat [s1, s2, s3].
+    NOTE(review): only exercises a json round trip, no application code.
+    """
+    first_group = [
+        {"filename": "a.pdf", "page_number": 1},
+        {"filename": "a.pdf", "page_number": 2},
+    ]
+    second_group = [
+        {"filename": "b.pdf", "page_number": 1},
+    ]
+
+    # Round-trip the nested structure through JSON text.
+    parsed = json.loads(json.dumps([first_group, second_group]))
+
+    # Outer container: one entry per sub-question.
+    assert isinstance(parsed, list)
+    assert len(parsed) == 2
+    # Each entry is itself a list of the expected size.
+    for group, expected_size in zip(parsed, (2, 1)):
+        assert isinstance(group, list)
+        assert len(group) == expected_size
+    assert parsed[0][0]["filename"] == "a.pdf"
+    assert parsed[1][0]["filename"] == "b.pdf"
+
+
+# ---------------------------------------------------------------------------
+# Test: chunks retrieved XML has sub_q wrappers
+# ---------------------------------------------------------------------------
+def test_chunks_retrieved_xml_has_subq_wrappers():
+    """format_chunks_retrieved_per_subq wraps each sub-question's chunks in a
+    <sub_q idx="N" question="..."> element."""
+    chunks_a = [("chunk A1 text", {"filename": "a.pdf", "page_number": 1}, 0.5)]
+    chunks_b = [
+        ("chunk B1 text", {"filename": "b.pdf", "page_number": 3}, 0.8),
+        ("chunk B2 text", {"filename": "b.pdf", "page_number": 4}, 0.9),
+    ]
+
+    xml = format_chunks_retrieved_per_subq(
+        [("What is A?", chunks_a), ("What is B?", chunks_b)]
+    )
+
+    expected_fragments = (
+        '<sub_q idx="0" question="What is A?">',
+        '<sub_q idx="1" question="What is B?">',
+        "</sub_q>",
+        "chunk A1 text", "chunk B1 text", "chunk B2 text",
+        "a.pdf", "b.pdf",
+    )
+    # Opening/closing wrappers, every chunk text and both filenames.
+    for fragment in expected_fragments:
+        assert fragment in xml
+
+
+# ---------------------------------------------------------------------------
+# Test: chunks filtered XML has sub_q wrappers
+# ---------------------------------------------------------------------------
+def test_chunks_filtered_xml_has_subq_wrappers():
+    """format_chunks_filtered_per_subq wraps each sub-question's chunks in a
+    <sub_q idx="N" question="..."> element and emits relevance scores."""
+    meta_x = {"filename": "x.pdf", "page_number": 1, "relevance_score": 8}
+    meta_y = {"filename": "y.pdf", "page_number": 2, "relevance_score": 9}
+    per_subq = [
+        ("What is X?", [("filtered X1", meta_x)]),
+        ("What is Y?", [("filtered Y1", meta_y)]),
+    ]
+
+    xml = format_chunks_filtered_per_subq(per_subq)
+
+    expected_fragments = (
+        '<sub_q idx="0" question="What is X?">',
+        '<sub_q idx="1" question="What is Y?">',
+        "</sub_q>",
+        "filtered X1", "filtered Y1",
+        "Relevance: 8", "Relevance: 9",
+    )
+    for fragment in expected_fragments:
+        assert fragment in xml
+
+
+# ---------------------------------------------------------------------------
+# Test: per-subq count columns exist in query_history table
+# ---------------------------------------------------------------------------
+def test_per_subq_count_columns_exist():
+    """Verify that chunks_retrieved_per_subq_count and
+    chunks_filtered_per_subq_count columns exist in query_history."""
+    # TemporaryDirectory removes the DB file afterwards (mkdtemp leaked it).
+    with tempfile.TemporaryDirectory() as db_dir:
+        conn = sqlite3.connect(os.path.join(db_dir, "test_history.db"))
+        try:
+            init_history_db(conn)
+            # Column name is field 1 of each PRAGMA table_info row.
+            cursor = conn.execute("PRAGMA table_info(query_history)")
+            columns = {row[1] for row in cursor.fetchall()}
+        finally:
+            conn.close()  # close even if init fails, so the dir can be deleted
+
+    assert "chunks_retrieved_per_subq_count" in columns, (
+        f"chunks_retrieved_per_subq_count not found. Columns: {columns}"
+    )
+    assert "chunks_filtered_per_subq_count" in columns, (
+        f"chunks_filtered_per_subq_count not found. Columns: {columns}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test: existing records have NULL per-subq counts (migration safety)
+# ---------------------------------------------------------------------------
+def test_existing_records_have_null_per_subq_counts():
+    """Pre-migration records should have NULL in the new per-subq columns."""
+    # TemporaryDirectory removes the DB file afterwards (mkdtemp leaked it).
+    with tempfile.TemporaryDirectory() as db_dir:
+        conn = sqlite3.connect(os.path.join(db_dir, "test_history_migrate.db"))
+        try:
+            # Legacy schema: the table without the new per-subq columns.
+            conn.execute("""
+                CREATE TABLE IF NOT EXISTS query_history (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    input_text TEXT NOT NULL,
+                    chunks_retrieved_count INTEGER DEFAULT 0,
+                    chunks_filtered_count INTEGER DEFAULT 0,
+                    created_at TEXT NOT NULL DEFAULT (datetime('now'))
+                )
+            """)
+            # Insert an "old" record WITHOUT the new columns
+            conn.execute(
+                "INSERT INTO query_history (input_text) VALUES (?)",
+                ("old question",),
+            )
+            conn.commit()
+            # Now run the migration (init_history_db should add missing columns)
+            init_history_db(conn)
+
+            # One parameterized query yields both the row and its column names.
+            cursor = conn.execute(
+                "SELECT * FROM query_history WHERE input_text = ?",
+                ("old question",),
+            )
+            col_names = [desc[0] for desc in cursor.description]
+            row_dict = dict(zip(col_names, cursor.fetchone()))
+        finally:
+            conn.close()  # runs even when a step above raises
+
+    # After migration the new columns exist and hold NULL for the old row.
+    assert "chunks_retrieved_per_subq_count" in col_names
+    assert "chunks_filtered_per_subq_count" in col_names
+    assert row_dict["chunks_retrieved_per_subq_count"] is None
+    assert row_dict["chunks_filtered_per_subq_count"] is None
diff --git a/backend/app/test/test_phase4_prompt_templates.py b/backend/app/test/test_phase4_prompt_templates.py
new file mode 100644
index 0000000..82356d9
--- /dev/null
+++ b/backend/app/test/test_phase4_prompt_templates.py
@@ -0,0 +1,66 @@
+"""Tests for Phase 4.4 — Prompt template updates for per-sub-question pipeline.
+
+Covers:
+- generate template uses {context_sections} placeholder (not {context})
+- Built-in fallback template uses {context_sections}
+- reset_to_defaults() resets generate to the new template
+"""
+import os
+import tempfile
+
+import pytest
+
+from app.core.sqlite_db import init_prompts_db, seed_default_profiles
+from app.services.prompt_service import PromptService
+
+
+@pytest.fixture
+def prompt_service(tmp_path):
+    """Create a PromptService backed by a seeded temp DB under tmp_path."""
+    import sqlite3  # local: this module has no top-level sqlite3 import
+    # tmp_path is managed by pytest, so nothing is left behind (mkdtemp was).
+    db_path = str(tmp_path / "test_prompts.db")
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    try:
+        init_prompts_db(conn)
+        seed_default_profiles(conn)
+    finally:
+        conn.close()  # release the file even if seeding fails
+    return PromptService(db_path=db_path)
+
+
+def test_generate_template_uses_context_sections(prompt_service):
+    """The "generate" template served by PromptService must use the
+    {context_sections} placeholder and no longer the old {context} one."""
+    generate_tpl = prompt_service.get_prompt_template("generate")
+
+    assert "{context}" not in generate_tpl
+    assert "{context_sections}" in generate_tpl
+
+
+def test_builtin_generate_template_has_context_sections():
+    """The built-in seed template (_SEED_GENERATE) must use
+    {context_sections} rather than the old {context} placeholder."""
+    from app.core import sqlite_db
+    seed_template = sqlite_db._SEED_GENERATE
+    assert "{context}" not in seed_template
+    assert "{context_sections}" in seed_template
+
+
+def test_reset_to_defaults_includes_new_generate_template(prompt_service):
+    """reset_to_defaults() on a profile must bring its "generate" step back
+    to a template that uses the {context_sections} placeholder."""
+    active = prompt_service.get_active_profile_name()
+
+    # Override the step with an old-style template and confirm it is served.
+    prompt_service.update_prompt(active, "generate", "custom template with {context}")
+    overridden = prompt_service.get_prompt_template("generate")
+    assert "{context_sections}" not in overridden
+    assert "{context}" in overridden
+
+    # Resetting the profile must discard the override.
+    prompt_service.reset_to_defaults(active)
+    restored = prompt_service.get_prompt_template("generate")
+    assert "{context}" not in restored
+    assert "{context_sections}" in restored
diff --git a/backend/app/test/test_phase4_response_format.py b/backend/app/test/test_phase4_response_format.py
new file mode 100644
index 0000000..88cf7f1
--- /dev/null
+++ b/backend/app/test/test_phase4_response_format.py
@@ -0,0 +1,124 @@
+"""Tests for per-sub-question response format validation — Phase 4.3.
+
+Covers answer format invariants:
+- Sub-question headers present in markdown
+- Citation bracket labels in answer text
+- grouped_sources match sub-question boundaries
+- Single sub-question still uses header format
+"""
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+
+from app.services.rag import RAGService
+
+
+# ---------------------------------------------------------------------------
+# Test: answer has sub-question headers
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_answer_has_subquestion_headers():
+    """The answer must carry one "## Sub-question N:" header per sub-question."""
+    canned_reply = (
+        "## Sub-question 1: First question?\n"
+        "- Point one [doc.pdf, page 1]\n\n"
+        "## Sub-question 2: Second question?\n"
+        "- Point two [doc.pdf, page 2]\n"
+    )
+    llm = MagicMock()
+    llm.complete = AsyncMock(return_value=canned_reply)
+    pages = [[{"filename": "doc.pdf", "page_number": n}] for n in (1, 2)]
+
+    rag = RAGService(llm_client=llm)
+    answer, _prompt, _sources = await rag.generate_response_per_subquestion(
+        sub_questions=["First question?", "Second question?"],
+        sub_chunks=[["chunk1"], ["chunk2"]],
+        sub_metadata=pages,
+    )
+
+    # Both numbered headers from the mocked reply must be present.
+    for number in (1, 2):
+        assert f"## Sub-question {number}:" in answer
+
+
+# ---------------------------------------------------------------------------
+# Test: citations use bracket labels
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_answer_citations_use_bracket_labels():
+    """The answer must contain a "[filename, page N]" style citation."""
+    canned_reply = (
+        "## Sub-question 1: What is X?\n"
+        "- X is defined as a variable [report.pdf, page 5]\n"
+    )
+    llm = MagicMock()
+    llm.complete = AsyncMock(return_value=canned_reply)
+    rag = RAGService(llm_client=llm)
+    answer, _prompt, _sources = await rag.generate_response_per_subquestion(
+        sub_questions=["What is X?"],
+        sub_chunks=[["chunk about X"]],
+        sub_metadata=[[{"filename": "report.pdf", "page_number": 5}]],
+    )
+
+    assert "[report.pdf, page 5]" in answer
+
+
+# ---------------------------------------------------------------------------
+# Test: grouped_sources match sub-question boundaries
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_grouped_sources_match_subquestions():
+    """grouped_sources[i] must hold exactly the metadata supplied for
+    sub-question i, with nothing leaking across sub-questions."""
+    llm = MagicMock()
+    llm.complete = AsyncMock(
+        return_value="## Sub-question 1: Q1?\n- A1\n\n## Sub-question 2: Q2?\n- A2\n"
+    )
+    metadata_q1 = [
+        {"filename": "alpha.pdf", "page_number": 1},
+        {"filename": "beta.pdf", "page_number": 2},
+    ]
+    metadata_q2 = [
+        {"filename": "gamma.pdf", "page_number": 3},
+    ]
+
+    rag = RAGService(llm_client=llm)
+    _answer, _prompt, grouped_sources = await rag.generate_response_per_subquestion(
+        sub_questions=["Q1?", "Q2?"],
+        sub_chunks=[
+            ["chunk_alpha", "chunk_beta"],
+            ["chunk_gamma"],
+        ],
+        sub_metadata=[metadata_q1, metadata_q2],
+    )
+
+    # Collapse each group to its set of filenames so order is irrelevant.
+    filenames_per_group = [
+        {source["filename"] for source in group} for group in grouped_sources
+    ]
+    # Two groups: alpha + beta for sub-q 0, gamma alone for sub-q 1.
+    expected = [{"alpha.pdf", "beta.pdf"}, {"gamma.pdf"}]
+    assert filenames_per_group == expected
+
+
+# ---------------------------------------------------------------------------
+# Test: single sub-question still uses header format
+# ---------------------------------------------------------------------------
+@pytest.mark.asyncio
+async def test_single_subquestion_format():
+    """A lone sub-question still gets a "## Sub-question 1:" header."""
+    canned_reply = (
+        "## Sub-question 1: What is this?\n"
+        "- It is a test [test.pdf, page 1]\n"
+    )
+    llm = MagicMock()
+    llm.complete = AsyncMock(return_value=canned_reply)
+
+    rag = RAGService(llm_client=llm)
+    answer, _prompt, grouped_sources = await rag.generate_response_per_subquestion(
+        sub_questions=["What is this?"],
+        sub_chunks=[["test chunk"]],
+        sub_metadata=[[{"filename": "test.pdf", "page_number": 1}]],
+    )
+
+    assert "## Sub-question 1:" in answer
+    assert [len(group) for group in grouped_sources] == [1]