fix(relevance): tolerate LLM score count mismatches via padding instead of discarding

The per-sub-question filter was all-or-nothing: if the LLM returned 9 scores for 10 chunks (common with qwen3.5-35b), every chunk was discarded and the user got 'no relevant information found'. Now: fewer scores → pad the missing entries with 0.0; more scores → truncate to the expected count. A sub-question that has chunks but no score entry at all is still logged as an error and returns no results. The log level of the count-mismatch message changed from error → warning since this case is recoverable. Also improve LTT page UI: sources are collapsed by default in per-sub-q sections, and the 'Your question' text now wraps to show the full question instead of being truncated. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-27 14:31:18 +08:00 · 2026-04-27 14:31:18 +08:00 · a7a22f1494
parent 2656f9ca08
commit a7a22f1494
5 changed files with 30 additions and 16 deletions
--- a/backend/app/services/relevance_filter.py
+++ b/backend/app/services/relevance_filter.py
@ -212,15 +212,23 @@ class RelevanceFilter:
            key = str(idx)
            if len(sub_chunks[idx]) == 0:
                continue
-            if key not in score_map or len(score_map[key]) != len(sub_chunks[idx]):
-                logger.error(
-                    "RelevanceFilter per-subq score count mismatch for sub-q %d: "
-                    "expected %d scores, got %d",
-                    idx, len(sub_chunks[idx]),
-                    len(score_map.get(key, [])),
-                )
+            if key not in score_map:
+                logger.error("RelevanceFilter per-subq: no scores for sub-q %d", idx)
                return [], prompt

+            expected = len(sub_chunks[idx])
+            actual = len(score_map[key])
+            if actual != expected:
+                logger.warning(
+                    "RelevanceFilter per-subq score count mismatch for sub-q %d: "
+                    "expected %d scores, got %d — padding with 0.0",
+                    idx, expected, actual,
+                )
+                if actual < expected:
+                    score_map[key].extend([0.0] * (expected - actual))
+                else:
+                    score_map[key] = score_map[key][:expected]
+
        filtered_results: List[Tuple[str, List[Tuple[str, Dict]]]] = []
        for idx, (sq, chunks) in enumerate(zip(sub_questions, sub_chunks)):
            scores = score_map.get(str(idx), [])
--- a/backend/app/test/test_phase4_relevance_filter_per_subq.py
+++ b/backend/app/test/test_phase4_relevance_filter_per_subq.py
@ -170,7 +170,7 @@ async def test_filter_per_subq_llm_returns_invalid_json(tmp_path):
 # Test: score count mismatch
 # ---------------------------------------------------------------------------
 async def test_filter_per_subq_score_count_mismatch(tmp_path):
-    """Sub-q 0 has 2 chunks but LLM returns only 1 score → returns ([], prompt)."""
+    """Sub-q 0 has 2 chunks but LLM returns only 1 score — pads with 0.0, keeps high-scored chunk."""
    from app.services.relevance_filter import RelevanceFilter

    llm = _MockLLM(response='{"0": [8.5]}')
@ -183,7 +183,12 @@ async def test_filter_per_subq_score_count_mismatch(tmp_path):
        threshold=7.0,
    )

-    assert results == []
+    assert len(results) == 1
+    sq, chunks = results[0]
+    assert sq == "What is A?"
+    assert len(chunks) == 1
+    assert chunks[0][0] == "chunk A1"
+    assert chunks[0][1]["relevance_score"] == 8.5
    assert prompt != ""


--- a/frontend/src/components/QueryInput.tsx
+++ b/frontend/src/components/QueryInput.tsx
@ -55,7 +55,7 @@ export const QueryInput: React.FC<QueryInputProps> = ({ onSubmit, isLoading }) =
          {isLoading ? 'Processing...' : 'Submit'}
        </button>
        {submittedQuestion && (
-          <p data-testid="submitted-question" className="text-sm text-gray-500 italic truncate">
+          <p data-testid="submitted-question" className="text-sm text-gray-500 italic break-words">
            Your question: &ldquo;{submittedQuestion}&rdquo;
          </p>
        )}
--- a/frontend/src/components/ResponsePanel.tsx
+++ b/frontend/src/components/ResponsePanel.tsx
@ -74,7 +74,7 @@ function SubQuestionSection({
  subQuestion: SubQuestionSources
  answerSection: string
 }) {
-  const [expanded, setExpanded] = useState(true)
+  const [expanded, setExpanded] = useState(false)
  const processedAnswer = processCitationsForSubq(answerSection, [subQuestion], 0)

  return (
--- a/frontend/src/test/components/test_phase4_response_panel.test.tsx
+++ b/frontend/src/test/components/test_phase4_response_panel.test.tsx
@ -80,13 +80,14 @@ describe('ResponsePanel — per-sub-question rendering (Phase 4)', () => {
    const toggles = screen.getAllByTestId('sources-toggle')
    expect(toggles).toHaveLength(2)
    expect(toggles[0]).toHaveTextContent('Sources (1)')
-    expect(screen.getAllByTestId('sources-container')).toHaveLength(2)

-    fireEvent.click(toggles[1])
-    const sourceCards = screen.getAllByTestId('sources-container')
-    expect(sourceCards).toHaveLength(1)
+    // Default: both collapsed (hidden)
+    expect(screen.queryAllByTestId('sources-container')).toHaveLength(0)
+
+    // Click first toggle to expand
+    fireEvent.click(toggles[0])
+    expect(screen.getAllByTestId('sources-container')).toHaveLength(1)
    expect(screen.getByText(/Page 3/)).toBeInTheDocument()
-    expect(screen.queryByText(/Page 7/)).not.toBeInTheDocument()
  })

  it('falls back to flat rendering when subQuestionSources is null', () => {