test(backend): add Phase 4 unit tests for retrieval and filtering

10 tests for retrieve_per_subquestion() covering multi-sub-q, empty, single, call counting, n_results passthrough, and empty results. 14 tests for filter_per_subquestion() covering basic filtering, threshold behavior, JSON parsing edge cases, markdown extraction, LLM exceptions, and format helpers.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-26 23:28:45 +08:00
parent 0ecae11bf8
commit ab6ec28de6
4 changed files with 526 additions and 0 deletions

View File

@ -0,0 +1,102 @@
"""Tests for format_chunks_filtered_per_subq() helper — Phase 4.2.
Covers the XML formatting of per-sub-question filtered chunks:
- Normal multi-sub-question output
- Empty results
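- Single sub-question
- Chunk without page_number (no Page: line emitted)
- Chunk without relevance_score (Relevance: N/A)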
"""
import pytest
from app.routers.query import format_chunks_filtered_per_subq
def test_format_chunks_filtered_per_subq():
    """Two sub-questions render as <sub_q> sections with per-chunk metadata.

    Verifies that the output contains the sub-question wrappers, the chunk
    tags, the Relevance / Filename / Page lines and the chunk text itself.
    """
    chunks_for_a = [
        ("chunk A1 text", {"filename": "a.pdf", "relevance_score": 8.5}),
        ("chunk A2 text", {"filename": "a2.pdf", "relevance_score": 3.2}),
    ]
    chunks_for_b = [
        (
            "chunk B1 text",
            {"filename": "b.pdf", "page_number": 5, "relevance_score": 9.0},
        ),
    ]

    rendered = format_chunks_filtered_per_subq(
        [("What is A?", chunks_for_a), ("What is B?", chunks_for_b)]
    )

    expected_fragments = (
        # Sub-question wrappers
        '<sub_q idx="0" question="What is A?">',
        '<sub_q idx="1" question="What is B?">',
        "</sub_q>",
        # Chunk tags and their metadata lines
        "<chunk_1>",
        "<chunk_2>",
        "Relevance: 8.5",
        "Relevance: 9.0",
        "Relevance: 3.2",
        "Filename: a.pdf",
        "Filename: b.pdf",
        "Page: 5",
        # Chunk content
        "chunk A1 text",
        "chunk B1 text",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_format_chunks_filtered_per_subq_empty():
    """An empty results list must format to the empty string."""
    rendered = format_chunks_filtered_per_subq([])
    assert rendered == ""
def test_format_chunks_filtered_per_subq_single_subq():
    """One sub-question holding one chunk: wrapper, chunk tag, score and text appear."""
    only_chunk = ("only chunk text", {"filename": "doc.pdf", "relevance_score": 9.5})

    rendered = format_chunks_filtered_per_subq([("Only question?", [only_chunk])])

    expected_fragments = (
        '<sub_q idx="0" question="Only question?">',
        "<chunk_1>",
        "Relevance: 9.5",
        "only chunk text",
        "</sub_q>",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_format_chunks_filtered_per_subq_no_page_number():
    """Metadata lacking page_number must not yield a Page: line."""
    metadata = {"filename": "f.pdf", "relevance_score": 8.0}

    rendered = format_chunks_filtered_per_subq([("Q?", [("text", metadata)])])

    # The relevance line is still emitted; only the page line is omitted.
    assert "Relevance: 8.0" in rendered
    assert "Page:" not in rendered
def test_format_chunks_filtered_per_subq_no_relevance_score():
    """Metadata lacking relevance_score is rendered as 'Relevance: N/A'."""
    unscored_chunk = ("text", {"filename": "f.pdf"})

    rendered = format_chunks_filtered_per_subq([("Q?", [unscored_chunk])])

    assert "Relevance: N/A" in rendered

View File

@ -0,0 +1,78 @@
"""Phase 4 tests: Per-sub-question XML formatting in query router.
Covers:
- format_chunks_retrieved_per_subq with multiple sub-questions
- format_chunks_retrieved_per_subq with empty results
- format_chunks_retrieved_per_subq with single sub-question
- XML special characters in question attributes (the test asserts they appear verbatim, unescaped)
"""
import pytest
from app.routers.query import format_chunks_retrieved_per_subq
class TestFormatChunksRetrievedPerSubq:
    """Exercise format_chunks_retrieved_per_subq() on (question, chunks) pairs."""

    def test_format_chunks_retrieved_per_subq(self):
        """Several sub-questions each get a <sub_q> wrapper around their chunks."""
        chunks_for_a = [
            ("chunk A1", {"filename": "a.pdf"}, 0.1),
            ("chunk A2", {"filename": "a2.pdf", "page_number": 3}, 0.2),
        ]
        chunks_for_b = [("chunk B1", {"filename": "b.pdf"}, 0.3)]

        rendered = format_chunks_retrieved_per_subq(
            [("What is A?", chunks_for_a), ("What is B?", chunks_for_b)]
        )

        expected_fragments = (
            '<sub_q idx="0" question="What is A?">',
            '<sub_q idx="1" question="What is B?">',
            "</sub_q>",
            "<chunk_1>",
            "<chunk_2>",
            "chunk A1",
            "chunk A2",
            "chunk B1",
            "a.pdf",
            "Page: 3",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_format_chunks_retrieved_per_subq_empty(self):
        """No sub-questions at all formats to the empty string."""
        assert format_chunks_retrieved_per_subq([]) == ""

    def test_format_chunks_retrieved_per_subq_single_subq(self):
        """A lone sub-question yields exactly one <sub_q> opening tag."""
        lone_entry = ("Only one?", [("chunk X", {"filename": "x.pdf"}, 0.1)])

        rendered = format_chunks_retrieved_per_subq([lone_entry])

        for fragment in ('<sub_q idx="0" question="Only one?">', "</sub_q>", "chunk X"):
            assert fragment in rendered
        assert rendered.count("<sub_q") == 1

    def test_format_chunks_retrieved_per_subq_escapes_xml(self):
        """Quotes, ampersands and angle brackets appear verbatim in the attribute.

        NOTE(review): despite the test name, the assertion expects the raw,
        unescaped characters in the output -- confirm whether escaping is
        actually intended for the question attribute.
        """
        tricky_question = 'What about "quotes" & <brackets>?'
        payload = [(tricky_question, [("data", {"filename": "f.pdf"}, 0.1)])]

        rendered = format_chunks_retrieved_per_subq(payload)

        assert 'question="What about "quotes" & <brackets>?"' in rendered

View File

@ -0,0 +1,214 @@
"""Tests for RelevanceFilter.filter_per_subquestion() — Phase 4.2.
Covers per-sub-question chunk filtering in a single LLM call:
- Basic scoring and threshold filtering per sub-question
- Empty inputs and edge cases
- Invalid JSON / score-count mismatch error handling
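- Threshold boundary behaviour (strict >)
- LLM call raising an exception
- JSON wrapped in a markdown code block
- Mixed empty and non-empty sub-questions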
"""
import json
import pytest
from unittest.mock import AsyncMock, MagicMock
from app.services.relevance_filter import RelevanceFilter
# ---------------------------------------------------------------------------
# Test: basic per-sub-question filtering
# ---------------------------------------------------------------------------
async def test_filter_per_subq_basic(mock_prompt_service):
    """Per-sub-question scores from one LLM reply are filtered by the threshold.

    Two sub-questions are sent; only chunks scoring above 7.0 are expected
    back, each annotated with its relevance_score, and the LLM is called once.
    """
    mock_llm = MagicMock(
        complete=AsyncMock(return_value='{"0": [8.5, 3.2], "1": [9.0]}')
    )
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)
    sub_questions = ["What is A?", "What is B?"]
    chunks_per_subq = [
        [("chunk A1", {"filename": "a.pdf"}), ("chunk A2", {"filename": "a2.pdf"})],
        [("chunk B1", {"filename": "b.pdf"})],
    ]

    results, prompt = await relevance_filter.filter_per_subquestion(
        sub_questions,
        chunks_per_subq,
        threshold=7.0,
    )

    # One (question, kept_chunks) entry per sub-question, in input order.
    assert len(results) == 2
    entry_a, entry_b = results[0], results[1]
    assert entry_a[0] == "What is A?"
    assert entry_b[0] == "What is B?"

    # Sub-q 0: of scores 8.5 and 3.2 only 8.5 exceeds the 7.0 threshold.
    kept_a = entry_a[1]
    assert len(kept_a) == 1
    assert kept_a[0][0] == "chunk A1"
    assert kept_a[0][1]["relevance_score"] == 8.5
    assert kept_a[0][1]["filename"] == "a.pdf"

    # Sub-q 1: the single 9.0 score passes.
    kept_b = entry_b[1]
    assert len(kept_b) == 1
    assert kept_b[0][0] == "chunk B1"
    assert kept_b[0][1]["relevance_score"] == 9.0

    # The returned prompt labels each sub-question.
    assert prompt != ""
    for label in ("Sub-question 0", "Sub-question 1"):
        assert label in prompt
    mock_llm.complete.assert_called_once()
# ---------------------------------------------------------------------------
# Test: empty input
# ---------------------------------------------------------------------------
async def test_filter_per_subq_empty_input(mock_prompt_service):
    """No sub-questions → ([], '') and the LLM is never invoked."""
    mock_llm = MagicMock(complete=AsyncMock())
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)

    results, prompt = await relevance_filter.filter_per_subquestion(
        [], [], threshold=7.0
    )

    assert results == []
    assert prompt == ""
    mock_llm.complete.assert_not_called()
# ---------------------------------------------------------------------------
# Test: sub-questions with all-empty chunk lists
# ---------------------------------------------------------------------------
async def test_filter_per_subq_all_empty_chunks(mock_prompt_service):
    """Sub-questions whose chunk lists are all empty come back with empty lists."""
    mock_llm = MagicMock(complete=AsyncMock())
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)
    sub_questions = ["What is A?", "What is B?"]

    results, prompt = await relevance_filter.filter_per_subquestion(
        sub_questions,
        [[], []],
        threshold=7.0,
    )

    assert len(results) == 2
    entry_a, entry_b = results[0], results[1]
    assert entry_a[0] == "What is A?"
    assert entry_a[1] == []
    assert entry_b[0] == "What is B?"
    assert entry_b[1] == []
    # With nothing to score, the LLM must not be consulted.
    mock_llm.complete.assert_not_called()
# ---------------------------------------------------------------------------
# Test: LLM returns invalid JSON
# ---------------------------------------------------------------------------
async def test_filter_per_subq_llm_returns_invalid_json(mock_prompt_service):
    """A non-JSON LLM reply → empty results alongside a non-empty prompt."""
    mock_llm = MagicMock(complete=AsyncMock(return_value="not json at all"))
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)
    chunks_per_subq = [[("chunk A1", {"filename": "a.pdf"})]]

    results, prompt = await relevance_filter.filter_per_subquestion(
        ["What is A?"],
        chunks_per_subq,
        threshold=7.0,
    )

    assert results == []
    assert prompt != ""
# ---------------------------------------------------------------------------
# Test: score count mismatch
# ---------------------------------------------------------------------------
async def test_filter_per_subq_score_count_mismatch(mock_prompt_service):
    """Fewer scores than chunks (1 for 2) → empty results with a non-empty prompt."""
    mock_llm = MagicMock(complete=AsyncMock(return_value='{"0": [8.5]}'))
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)
    two_chunks = [
        ("chunk A1", {"filename": "a.pdf"}),
        ("chunk A2", {"filename": "a2.pdf"}),
    ]

    results, prompt = await relevance_filter.filter_per_subquestion(
        ["What is A?"],
        [two_chunks],
        threshold=7.0,
    )

    assert results == []
    assert prompt != ""
# ---------------------------------------------------------------------------
# Test: strict threshold boundary
# ---------------------------------------------------------------------------
async def test_filter_per_subq_passes_threshold_correctly(mock_prompt_service):
    """The threshold is strict: a score equal to it is dropped, one above is kept."""
    # Scores [7.0, 7.1] against threshold 7.0 → only the 7.1 chunk should survive.
    mock_llm = MagicMock(complete=AsyncMock(return_value='{"0": [7.0, 7.1]}'))
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)
    boundary_chunks = [
        ("exact threshold", {"filename": "f1.pdf"}),
        ("above threshold", {"filename": "f2.pdf"}),
    ]

    results, prompt = await relevance_filter.filter_per_subquestion(
        ["Boundary test?"],
        [boundary_chunks],
        threshold=7.0,
    )

    assert len(results) == 1
    kept = results[0][1]
    assert len(kept) == 1
    assert kept[0][0] == "above threshold"
    assert kept[0][1]["relevance_score"] == 7.1
# ---------------------------------------------------------------------------
# Test: LLM exception
# ---------------------------------------------------------------------------
async def test_filter_per_subq_llm_exception(mock_prompt_service):
    """An exception from the LLM call → empty results with a non-empty prompt."""
    failing_complete = AsyncMock(side_effect=RuntimeError("LLM unavailable"))
    mock_llm = MagicMock(complete=failing_complete)
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)
    chunks_per_subq = [[("chunk A1", {"filename": "a.pdf"})]]

    results, prompt = await relevance_filter.filter_per_subquestion(
        ["What is A?"],
        chunks_per_subq,
        threshold=7.0,
    )

    # The error is expected to be absorbed; the prompt is still returned.
    assert results == []
    assert prompt != ""
# ---------------------------------------------------------------------------
# Test: JSON wrapped in markdown code block
# ---------------------------------------------------------------------------
async def test_filter_per_subq_json_in_markdown_code_block(mock_prompt_service):
    """Scores wrapped in a ```json fenced block are still parsed and applied."""
    fenced_reply = '```json\n{"0": [9.0]}\n```'
    mock_llm = MagicMock(complete=AsyncMock(return_value=fenced_reply))
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)
    chunks_per_subq = [[("chunk A1", {"filename": "a.pdf"})]]

    results, prompt = await relevance_filter.filter_per_subquestion(
        ["What is A?"],
        chunks_per_subq,
        threshold=7.0,
    )

    assert len(results) == 1
    kept = results[0][1]
    assert len(kept) == 1
    assert kept[0][1]["relevance_score"] == 9.0
# ---------------------------------------------------------------------------
# Test: mixed empty and non-empty sub-questions
# ---------------------------------------------------------------------------
async def test_filter_per_subq_mixed_empty_and_nonempty(mock_prompt_service):
    """A sub-question with chunks is scored; one without chunks stays empty."""
    mock_llm = MagicMock(complete=AsyncMock(return_value='{"0": [8.5]}'))
    relevance_filter = RelevanceFilter(mock_llm, prompt_service=mock_prompt_service)
    sub_questions = ["What is A?", "What is B?"]
    chunks_per_subq = [[("chunk A1", {"filename": "a.pdf"})], []]

    results, prompt = await relevance_filter.filter_per_subquestion(
        sub_questions,
        chunks_per_subq,
        threshold=7.0,
    )

    assert len(results) == 2
    kept_a = results[0][1]
    kept_b = results[1][1]
    assert len(kept_a) == 1
    assert kept_a[0][1]["relevance_score"] == 8.5
    assert kept_b == []

View File

@ -0,0 +1,132 @@
"""Phase 4 tests: Per-sub-question retrieval in RAGService.
Covers:
- retrieve_per_subquestion() with multiple sub-questions
- retrieve_per_subquestion() with empty input
- retrieve_per_subquestion() with single sub-question
- Verify retrieve() is called once per sub-question
- n_results parameter passthrough
- Handling of empty results for individual sub-questions
"""
import pytest
from unittest.mock import MagicMock
from app.services.rag import RAGService
class TestRetrievePerSubquestion:
    """Exercise RAGService.retrieve_per_subquestion() against mocked backends."""

    @staticmethod
    def _make_service() -> RAGService:
        """Build a RAGService whose client and collection are MagicMocks."""
        collection = MagicMock()
        client = MagicMock()
        client.get_or_create_collection.return_value = collection
        rag = RAGService(chroma_client=client)
        # Pin the collection directly so tests can stub its query() results.
        rag._collection = collection
        return rag

    def test_retrieve_per_subquestion_two_subqs(self):
        """Each of two sub-questions is paired with its own retrieved chunks."""
        rag = self._make_service()
        response_a = {
            "documents": [["chunk A1", "chunk A2"]],
            "metadatas": [[{"filename": "a.pdf"}, {"filename": "a2.pdf"}]],
            "distances": [[0.1, 0.2]],
        }
        response_b = {
            "documents": [["chunk B1"]],
            "metadatas": [[{"filename": "b.pdf"}]],
            "distances": [[0.3]],
        }
        # Successive query() calls hand back response_a, then response_b.
        rag._collection.query.side_effect = [response_a, response_b]

        results = rag.retrieve_per_subquestion(
            ["What is A?", "What is B?"], n_results=5
        )

        assert len(results) == 2
        entry_a, entry_b = results[0], results[1]

        assert entry_a[0] == "What is A?"
        assert len(entry_a[1]) == 2
        assert entry_a[1][0] == ("chunk A1", {"filename": "a.pdf"}, 0.1)
        assert entry_a[1][1] == ("chunk A2", {"filename": "a2.pdf"}, 0.2)

        assert entry_b[0] == "What is B?"
        assert len(entry_b[1]) == 1
        assert entry_b[1][0] == ("chunk B1", {"filename": "b.pdf"}, 0.3)

    def test_retrieve_per_subquestion_empty_list(self):
        """No sub-questions in, empty list out."""
        rag = self._make_service()
        assert rag.retrieve_per_subquestion([], n_results=10) == []

    def test_retrieve_per_subquestion_single_subq(self):
        """A single sub-question produces a one-entry result list."""
        rag = self._make_service()
        rag._collection.query.return_value = {
            "documents": [["chunk X"]],
            "metadatas": [[{"filename": "x.pdf"}]],
            "distances": [[0.05]],
        }

        results = rag.retrieve_per_subquestion(["Only question"], n_results=3)

        assert len(results) == 1
        only_entry = results[0]
        assert only_entry[0] == "Only question"
        assert len(only_entry[1]) == 1
        assert only_entry[1][0] == ("chunk X", {"filename": "x.pdf"}, 0.05)

    def test_retrieve_per_subquestion_calls_retrieve_n_times(self):
        """retrieve() is invoked once per sub-question, each wrapped in a list."""
        rag = self._make_service()
        # Replace retrieve() with a spy that returns no chunks.
        rag.retrieve = MagicMock(return_value=[])
        sub_questions = ["Q1", "Q2", "Q3"]

        rag.retrieve_per_subquestion(sub_questions, n_results=7)

        assert rag.retrieve.call_count == 3
        for question in sub_questions:
            rag.retrieve.assert_any_call([question], n_results=7)

    def test_retrieve_per_subquestion_preserves_n_results(self):
        """The n_results value is forwarded unchanged to retrieve()."""
        rag = self._make_service()
        rag.retrieve = MagicMock(return_value=[])

        rag.retrieve_per_subquestion(["Q1"], n_results=42)

        rag.retrieve.assert_called_once_with(["Q1"], n_results=42)

    def test_retrieve_per_subquestion_handles_empty_results(self):
        """A sub-question with no hits keeps its slot next to one that has hits."""
        rag = self._make_service()
        hit = ("chunk B", {"filename": "b.pdf"}, 0.2)
        # First retrieve() call yields nothing, the second yields one chunk.
        rag.retrieve = MagicMock(side_effect=[[], [hit]])

        results = rag.retrieve_per_subquestion(
            ["No results Q", "Has results Q"], n_results=5
        )

        assert len(results) == 2
        empty_entry, filled_entry = results[0], results[1]

        # The first sub-question is present but carries no chunks.
        assert empty_entry[0] == "No results Q"
        assert empty_entry[1] == []

        # The second sub-question carries the single retrieved chunk.
        assert filled_entry[0] == "Has results Q"
        assert len(filled_entry[1]) == 1
        assert filled_entry[1][0] == ("chunk B", {"filename": "b.pdf"}, 0.2)