diff --git a/backend/app/test/test_phase4_query_router_filter.py b/backend/app/test/test_phase4_query_router_filter.py new file mode 100644 index 0000000..1f3500a --- /dev/null +++ b/backend/app/test/test_phase4_query_router_filter.py @@ -0,0 +1,102 @@ +"""Tests for format_chunks_filtered_per_subq() helper — Phase 4.2. + +Covers the XML formatting of per-sub-question filtered chunks: +- Normal multi-sub-question output +- Empty results +- Single sub-question; chunks missing page_number or relevance_score +""" +import pytest + +from app.routers.query import format_chunks_filtered_per_subq + + +def test_format_chunks_filtered_per_subq(): + """Per-sub-q filtered results produce XML with wrappers and Relevance.""" + results = [ + ( + "What is A?", + [ + ("chunk A1 text", {"filename": "a.pdf", "relevance_score": 8.5}), + ("chunk A2 text", {"filename": "a2.pdf", "relevance_score": 3.2}), + ], + ), + ( + "What is B?", + [ + ("chunk B1 text", {"filename": "b.pdf", "page_number": 5, "relevance_score": 9.0}), + ], + ), + ] + + xml = format_chunks_filtered_per_subq(results) + + # Sub-question wrappers + assert '' in xml + assert '' in xml + assert "" in xml + + # Chunks with relevance + assert "" in xml + assert "" in xml + assert "Relevance: 8.5" in xml + assert "Relevance: 9.0" in xml + assert "Relevance: 3.2" in xml + assert "Filename: a.pdf" in xml + assert "Filename: b.pdf" in xml + assert "Page: 5" in xml + + # Content present + assert "chunk A1 text" in xml + assert "chunk B1 text" in xml + + +def test_format_chunks_filtered_per_subq_empty(): + """Empty results list → empty string.""" + assert format_chunks_filtered_per_subq([]) == "" + + +def test_format_chunks_filtered_per_subq_single_subq(): + """Single sub-question with one chunk.""" + results = [ + ( + "Only question?", + [ + ("only chunk text", {"filename": "doc.pdf", "relevance_score": 9.5}), + ], + ), + ] + + xml = format_chunks_filtered_per_subq(results) + + assert '' in xml + assert "" in xml + assert "Relevance: 9.5" in xml + assert "only chunk text" in 
xml + assert "" in xml + + +def test_format_chunks_filtered_per_subq_no_page_number(): + """Chunk without page_number should not include Page: line.""" + results = [ + ( + "Q?", + [("text", {"filename": "f.pdf", "relevance_score": 8.0})], + ), + ] + + xml = format_chunks_filtered_per_subq(results) + assert "Page:" not in xml + assert "Relevance: 8.0" in xml + + +def test_format_chunks_filtered_per_subq_no_relevance_score(): + """Chunk without relevance_score should show N/A.""" + results = [ + ( + "Q?", + [("text", {"filename": "f.pdf"})], + ), + ] + + xml = format_chunks_filtered_per_subq(results) + assert "Relevance: N/A" in xml diff --git a/backend/app/test/test_phase4_query_router_retrieval.py b/backend/app/test/test_phase4_query_router_retrieval.py new file mode 100644 index 0000000..6a0cbb3 --- /dev/null +++ b/backend/app/test/test_phase4_query_router_retrieval.py @@ -0,0 +1,78 @@ +"""Phase 4 tests: Per-sub-question XML formatting in query router. + +Covers: +- format_chunks_retrieved_per_subq with multiple sub-questions +- format_chunks_retrieved_per_subq with empty results +- format_chunks_retrieved_per_subq with single sub-question +- XML special character escaping in question attributes +""" +import pytest + +from app.routers.query import format_chunks_retrieved_per_subq + + +class TestFormatChunksRetrievedPerSubq: + """Tests for format_chunks_retrieved_per_subq().""" + + def test_format_chunks_retrieved_per_subq(self): + """Multiple sub-questions produce nested wrappers with chunks.""" + results = [ + ( + "What is A?", + [ + ("chunk A1", {"filename": "a.pdf"}, 0.1), + ("chunk A2", {"filename": "a2.pdf", "page_number": 3}, 0.2), + ], + ), + ( + "What is B?", + [("chunk B1", {"filename": "b.pdf"}, 0.3)], + ), + ] + + xml = format_chunks_retrieved_per_subq(results) + + assert '' in xml + assert '' in xml + assert "" in xml + assert "" in xml + assert "" in xml + assert "chunk A1" in xml + assert "chunk A2" in xml + assert "chunk B1" in xml + assert "a.pdf" 
in xml + assert "Page: 3" in xml + + def test_format_chunks_retrieved_per_subq_empty(self): + """Empty results list produces empty string.""" + xml = format_chunks_retrieved_per_subq([]) + assert xml == "" + + def test_format_chunks_retrieved_per_subq_single_subq(self): + """Single sub-question produces single wrapper.""" + results = [ + ( + "Only one?", + [("chunk X", {"filename": "x.pdf"}, 0.1)], + ), + ] + + xml = format_chunks_retrieved_per_subq(results) + + assert '' in xml + assert "" in xml + assert "chunk X" in xml + assert xml.count("?', + [("data", {"filename": "f.pdf"}, 0.1)], + ), + ] + + xml = format_chunks_retrieved_per_subq(results) + + assert 'question="What about "quotes" & ?"' in xml diff --git a/backend/app/test/test_phase4_relevance_filter_per_subq.py b/backend/app/test/test_phase4_relevance_filter_per_subq.py new file mode 100644 index 0000000..fd637df --- /dev/null +++ b/backend/app/test/test_phase4_relevance_filter_per_subq.py @@ -0,0 +1,214 @@ +"""Tests for RelevanceFilter.filter_per_subquestion() — Phase 4.2. 
+ +Covers per-sub-question chunk filtering in a single LLM call: +- Basic scoring and threshold filtering per sub-question +- Empty inputs and edge cases +- Invalid JSON / score-count mismatch error handling +- Threshold boundary behaviour (strict >) +""" +import json +import pytest +from unittest.mock import AsyncMock, MagicMock + +from app.services.relevance_filter import RelevanceFilter + + +# --------------------------------------------------------------------------- +# Test: basic per-sub-question filtering +# --------------------------------------------------------------------------- +async def test_filter_per_subq_basic(mock_prompt_service): + """Two sub-questions, LLM returns per-sub-q scores, threshold filters correctly.""" + llm = MagicMock() + llm.complete = AsyncMock(return_value='{"0": [8.5, 3.2], "1": [9.0]}') + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion( + ["What is A?", "What is B?"], + [ + [("chunk A1", {"filename": "a.pdf"}), ("chunk A2", {"filename": "a2.pdf"})], + [("chunk B1", {"filename": "b.pdf"})], + ], + threshold=7.0, + ) + + # Structure check + assert len(results) == 2 + assert results[0][0] == "What is A?" + assert results[1][0] == "What is B?" 
+ + # Sub-q 0: only score 8.5 passes threshold > 7.0 + assert len(results[0][1]) == 1 + assert results[0][1][0][0] == "chunk A1" + assert results[0][1][0][1]["relevance_score"] == 8.5 + assert results[0][1][0][1]["filename"] == "a.pdf" + + # Sub-q 1: score 9.0 passes + assert len(results[1][1]) == 1 + assert results[1][1][0][0] == "chunk B1" + assert results[1][1][0][1]["relevance_score"] == 9.0 + + # Prompt contains sub-question labels + assert prompt != "" + assert "Sub-question 0" in prompt + assert "Sub-question 1" in prompt + llm.complete.assert_called_once() + + +# --------------------------------------------------------------------------- +# Test: empty input +# --------------------------------------------------------------------------- +async def test_filter_per_subq_empty_input(mock_prompt_service): + """Empty sub_questions list returns ([], '').""" + llm = MagicMock() + llm.complete = AsyncMock() + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion([], [], threshold=7.0) + + assert results == [] + assert prompt == "" + llm.complete.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test: sub-questions with all-empty chunk lists +# --------------------------------------------------------------------------- +async def test_filter_per_subq_all_empty_chunks(mock_prompt_service): + """Two sub-questions, both with empty chunk lists → empty filtered lists.""" + llm = MagicMock() + llm.complete = AsyncMock() + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion( + ["What is A?", "What is B?"], + [[], []], + threshold=7.0, + ) + + assert len(results) == 2 + assert results[0][0] == "What is A?" + assert results[0][1] == [] + assert results[1][0] == "What is B?" 
+ assert results[1][1] == [] + # No LLM call needed when all chunk lists are empty + llm.complete.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test: LLM returns invalid JSON +# --------------------------------------------------------------------------- +async def test_filter_per_subq_llm_returns_invalid_json(mock_prompt_service): + """LLM returns non-JSON string → returns ([], prompt).""" + llm = MagicMock() + llm.complete = AsyncMock(return_value="not json at all") + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion( + ["What is A?"], + [[("chunk A1", {"filename": "a.pdf"})]], + threshold=7.0, + ) + + assert results == [] + assert prompt != "" + + +# --------------------------------------------------------------------------- +# Test: score count mismatch +# --------------------------------------------------------------------------- +async def test_filter_per_subq_score_count_mismatch(mock_prompt_service): + """Sub-q 0 has 2 chunks but LLM returns only 1 score → returns ([], prompt).""" + llm = MagicMock() + llm.complete = AsyncMock(return_value='{"0": [8.5]}') + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion( + ["What is A?"], + [[("chunk A1", {"filename": "a.pdf"}), ("chunk A2", {"filename": "a2.pdf"})]], + threshold=7.0, + ) + + assert results == [] + assert prompt != "" + + +# --------------------------------------------------------------------------- +# Test: strict threshold boundary +# --------------------------------------------------------------------------- +async def test_filter_per_subq_passes_threshold_correctly(mock_prompt_service): + """Score == threshold is NOT kept (strict >). 
Score > threshold IS kept.""" + llm = MagicMock() + # Sub-q 0: scores [7.0, 7.1] with threshold 7.0 → only 7.1 kept + llm.complete = AsyncMock(return_value='{"0": [7.0, 7.1]}') + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion( + ["Boundary test?"], + [[("exact threshold", {"filename": "f1.pdf"}), ("above threshold", {"filename": "f2.pdf"})]], + threshold=7.0, + ) + + assert len(results) == 1 + assert len(results[0][1]) == 1 + assert results[0][1][0][0] == "above threshold" + assert results[0][1][0][1]["relevance_score"] == 7.1 + + +# --------------------------------------------------------------------------- +# Test: LLM exception +# --------------------------------------------------------------------------- +async def test_filter_per_subq_llm_exception(mock_prompt_service): + """LLM call raises an exception → returns ([], prompt).""" + llm = MagicMock() + llm.complete = AsyncMock(side_effect=RuntimeError("LLM unavailable")) + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion( + ["What is A?"], + [[("chunk A1", {"filename": "a.pdf"})]], + threshold=7.0, + ) + + assert results == [] + assert prompt != "" + + +# --------------------------------------------------------------------------- +# Test: JSON wrapped in markdown code block +# --------------------------------------------------------------------------- +async def test_filter_per_subq_json_in_markdown_code_block(mock_prompt_service): + """LLM returns JSON inside ```json ... 
``` block → should parse correctly.""" + llm = MagicMock() + llm.complete = AsyncMock(return_value='```json\n{"0": [9.0]}\n```') + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion( + ["What is A?"], + [[("chunk A1", {"filename": "a.pdf"})]], + threshold=7.0, + ) + + assert len(results) == 1 + assert len(results[0][1]) == 1 + assert results[0][1][0][1]["relevance_score"] == 9.0 + + +# --------------------------------------------------------------------------- +# Test: mixed empty and non-empty sub-questions +# --------------------------------------------------------------------------- +async def test_filter_per_subq_mixed_empty_and_nonempty(mock_prompt_service): + """One sub-q with chunks, one without. Only non-empty ones get scored.""" + llm = MagicMock() + llm.complete = AsyncMock(return_value='{"0": [8.5]}') + + rf = RelevanceFilter(llm, prompt_service=mock_prompt_service) + results, prompt = await rf.filter_per_subquestion( + ["What is A?", "What is B?"], + [[("chunk A1", {"filename": "a.pdf"})], []], + threshold=7.0, + ) + + assert len(results) == 2 + assert len(results[0][1]) == 1 + assert results[0][1][0][1]["relevance_score"] == 8.5 + assert results[1][1] == [] diff --git a/backend/app/test/test_phase4_retrieve_per_subquestion.py b/backend/app/test/test_phase4_retrieve_per_subquestion.py new file mode 100644 index 0000000..2676fef --- /dev/null +++ b/backend/app/test/test_phase4_retrieve_per_subquestion.py @@ -0,0 +1,132 @@ +"""Phase 4 tests: Per-sub-question retrieval in RAGService. 
+ +Covers: +- retrieve_per_subquestion() with multiple sub-questions +- retrieve_per_subquestion() with empty input +- retrieve_per_subquestion() with single sub-question +- Verify retrieve() is called once per sub-question +- n_results parameter passthrough +- Handling of empty results for individual sub-questions +""" +import pytest +from unittest.mock import MagicMock + +from app.services.rag import RAGService + + +class TestRetrievePerSubquestion: + """Tests for RAGService.retrieve_per_subquestion().""" + + @staticmethod + def _make_service() -> RAGService: + """Create a RAGService with a mocked collection.""" + mock_collection = MagicMock() + mock_client = MagicMock() + mock_client.get_or_create_collection.return_value = mock_collection + service = RAGService(chroma_client=mock_client) + service._collection = mock_collection + return service + + def test_retrieve_per_subquestion_two_subqs(self): + """Two sub-questions should each return their own chunks.""" + service = self._make_service() + service._collection.query.side_effect = [ + { + "documents": [["chunk A1", "chunk A2"]], + "metadatas": [[{"filename": "a.pdf"}, {"filename": "a2.pdf"}]], + "distances": [[0.1, 0.2]], + }, + { + "documents": [["chunk B1"]], + "metadatas": [[{"filename": "b.pdf"}]], + "distances": [[0.3]], + }, + ] + + results = service.retrieve_per_subquestion( + ["What is A?", "What is B?"], n_results=5 + ) + + assert len(results) == 2 + + assert results[0][0] == "What is A?" + assert len(results[0][1]) == 2 + assert results[0][1][0] == ("chunk A1", {"filename": "a.pdf"}, 0.1) + assert results[0][1][1] == ("chunk A2", {"filename": "a2.pdf"}, 0.2) + + assert results[1][0] == "What is B?" 
+ assert len(results[1][1]) == 1 + assert results[1][1][0] == ("chunk B1", {"filename": "b.pdf"}, 0.3) + + def test_retrieve_per_subquestion_empty_list(self): + """Empty sub_questions list returns empty list.""" + service = self._make_service() + results = service.retrieve_per_subquestion([], n_results=10) + assert results == [] + + def test_retrieve_per_subquestion_single_subq(self): + """Single sub-question returns a single-element result list.""" + service = self._make_service() + service._collection.query.return_value = { + "documents": [["chunk X"]], + "metadatas": [[{"filename": "x.pdf"}]], + "distances": [[0.05]], + } + + results = service.retrieve_per_subquestion(["Only question"], n_results=3) + + assert len(results) == 1 + assert results[0][0] == "Only question" + assert len(results[0][1]) == 1 + assert results[0][1][0] == ("chunk X", {"filename": "x.pdf"}, 0.05) + + def test_retrieve_per_subquestion_calls_retrieve_n_times(self): + """retrieve() should be called once per sub-question with correct args.""" + service = self._make_service() + + # Mock retrieve to return empty chunks so we can spy on calls + service.retrieve = MagicMock(return_value=[]) + + sub_questions = ["Q1", "Q2", "Q3"] + service.retrieve_per_subquestion(sub_questions, n_results=7) + + assert service.retrieve.call_count == 3 + service.retrieve.assert_any_call(["Q1"], n_results=7) + service.retrieve.assert_any_call(["Q2"], n_results=7) + service.retrieve.assert_any_call(["Q3"], n_results=7) + + def test_retrieve_per_subquestion_preserves_n_results(self): + """n_results parameter is passed through to each retrieve() call.""" + service = self._make_service() + service.retrieve = MagicMock(return_value=[]) + + service.retrieve_per_subquestion(["Q1"], n_results=42) + + service.retrieve.assert_called_once_with(["Q1"], n_results=42) + + def test_retrieve_per_subquestion_handles_empty_results(self): + """One sub-q returns no results, another returns results.""" + service = self._make_service() + 
+ # First call returns empty, second returns data + service.retrieve = MagicMock( + side_effect=[ + [], + [("chunk B", {"filename": "b.pdf"}, 0.2)], + ] + ) + + results = service.retrieve_per_subquestion( + ["No results Q", "Has results Q"], n_results=5 + ) + + assert len(results) == 2 + + # First sub-question has empty chunks + assert results[0][0] == "No results Q" + assert results[0][1] == [] + + # Second sub-question has chunks + assert results[1][0] == "Has results Q" + assert len(results[1][1]) == 1 + assert results[1][1][0] == ("chunk B", {"filename": "b.pdf"}, 0.2)