fix(relevance): tolerate LLM score count mismatches via padding instead of discarding

The per-sub-question filter was all-or-nothing: if the LLM returned
9 scores for 10 chunks (common with qwen3.5-35b), every chunk was
discarded and the user got 'no relevant information found'.

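Now: fewer scores → pad the tail with 0.0; more scores → truncate to the
chunk count. The count-mismatch log level is lowered from error to
warning since the condition is now recoverable. A sub-question with no
score entry at all is still logged as an error and returns no results.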

Also improve the LTT page UI: sources are now collapsed by default in
per-sub-question sections, and the 'Your question' text now wraps to show
the full question instead of being truncated.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-27 14:31:18 +08:00
parent 2656f9ca08
commit a7a22f1494
5 changed files with 30 additions and 16 deletions

View File

@ -212,15 +212,23 @@ class RelevanceFilter:
key = str(idx) key = str(idx)
if len(sub_chunks[idx]) == 0: if len(sub_chunks[idx]) == 0:
continue continue
if key not in score_map or len(score_map[key]) != len(sub_chunks[idx]): if key not in score_map:
logger.error( logger.error("RelevanceFilter per-subq: no scores for sub-q %d", idx)
"RelevanceFilter per-subq score count mismatch for sub-q %d: "
"expected %d scores, got %d",
idx, len(sub_chunks[idx]),
len(score_map.get(key, [])),
)
return [], prompt return [], prompt
expected = len(sub_chunks[idx])
actual = len(score_map[key])
if actual != expected:
logger.warning(
"RelevanceFilter per-subq score count mismatch for sub-q %d: "
"expected %d scores, got %d — padding with 0.0",
idx, expected, actual,
)
if actual < expected:
score_map[key].extend([0.0] * (expected - actual))
else:
score_map[key] = score_map[key][:expected]
filtered_results: List[Tuple[str, List[Tuple[str, Dict]]]] = [] filtered_results: List[Tuple[str, List[Tuple[str, Dict]]]] = []
for idx, (sq, chunks) in enumerate(zip(sub_questions, sub_chunks)): for idx, (sq, chunks) in enumerate(zip(sub_questions, sub_chunks)):
scores = score_map.get(str(idx), []) scores = score_map.get(str(idx), [])

View File

@ -170,7 +170,7 @@ async def test_filter_per_subq_llm_returns_invalid_json(tmp_path):
# Test: score count mismatch # Test: score count mismatch
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
async def test_filter_per_subq_score_count_mismatch(tmp_path): async def test_filter_per_subq_score_count_mismatch(tmp_path):
"""Sub-q 0 has 2 chunks but LLM returns only 1 score → returns ([], prompt).""" """Sub-q 0 has 2 chunks but LLM returns only 1 score — pads with 0.0, keeps high-scored chunk."""
from app.services.relevance_filter import RelevanceFilter from app.services.relevance_filter import RelevanceFilter
llm = _MockLLM(response='{"0": [8.5]}') llm = _MockLLM(response='{"0": [8.5]}')
@ -183,7 +183,12 @@ async def test_filter_per_subq_score_count_mismatch(tmp_path):
threshold=7.0, threshold=7.0,
) )
assert results == [] assert len(results) == 1
sq, chunks = results[0]
assert sq == "What is A?"
assert len(chunks) == 1
assert chunks[0][0] == "chunk A1"
assert chunks[0][1]["relevance_score"] == 8.5
assert prompt != "" assert prompt != ""

View File

@ -55,7 +55,7 @@ export const QueryInput: React.FC<QueryInputProps> = ({ onSubmit, isLoading }) =
{isLoading ? 'Processing...' : 'Submit'} {isLoading ? 'Processing...' : 'Submit'}
</button> </button>
{submittedQuestion && ( {submittedQuestion && (
<p data-testid="submitted-question" className="text-sm text-gray-500 italic truncate"> <p data-testid="submitted-question" className="text-sm text-gray-500 italic break-words">
Your question: &ldquo;{submittedQuestion}&rdquo; Your question: &ldquo;{submittedQuestion}&rdquo;
</p> </p>
)} )}

View File

@ -74,7 +74,7 @@ function SubQuestionSection({
subQuestion: SubQuestionSources subQuestion: SubQuestionSources
answerSection: string answerSection: string
}) { }) {
const [expanded, setExpanded] = useState(true) const [expanded, setExpanded] = useState(false)
const processedAnswer = processCitationsForSubq(answerSection, [subQuestion], 0) const processedAnswer = processCitationsForSubq(answerSection, [subQuestion], 0)
return ( return (

View File

@ -80,13 +80,14 @@ describe('ResponsePanel — per-sub-question rendering (Phase 4)', () => {
const toggles = screen.getAllByTestId('sources-toggle') const toggles = screen.getAllByTestId('sources-toggle')
expect(toggles).toHaveLength(2) expect(toggles).toHaveLength(2)
expect(toggles[0]).toHaveTextContent('Sources (1)') expect(toggles[0]).toHaveTextContent('Sources (1)')
expect(screen.getAllByTestId('sources-container')).toHaveLength(2)
fireEvent.click(toggles[1]) // Default: both collapsed (hidden)
const sourceCards = screen.getAllByTestId('sources-container') expect(screen.queryAllByTestId('sources-container')).toHaveLength(0)
expect(sourceCards).toHaveLength(1)
// Click first toggle to expand
fireEvent.click(toggles[0])
expect(screen.getAllByTestId('sources-container')).toHaveLength(1)
expect(screen.getByText(/Page 3/)).toBeInTheDocument() expect(screen.getByText(/Page 3/)).toBeInTheDocument()
expect(screen.queryByText(/Page 7/)).not.toBeInTheDocument()
}) })
it('falls back to flat rendering when subQuestionSources is null', () => { it('falls back to flat rendering when subQuestionSources is null', () => {