diff --git a/backend/app/models/highlight.py b/backend/app/models/highlight.py
new file mode 100644
index 0000000..5b98c7f
--- /dev/null
+++ b/backend/app/models/highlight.py
@@ -0,0 +1,44 @@
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+# Pairs one chunk (document_id + chunk_index) with one sub-question (text + index).
+class ChunkHighlightTarget(BaseModel):
+    document_id: str
+    chunk_index: int
+    sub_question_text: str
+    sub_question_index: int
+
+
+# Batch wrapper: a required list of ChunkHighlightTarget; no minimum length is enforced.
+class HighlightBatchRequest(BaseModel):
+    targets: list[ChunkHighlightTarget]
+
+
+# One sentence, identified by index, plus a reason limited to 80 characters.
+class RelevantSentence(BaseModel):
+    sentence_index: int = Field(description="0-based index of the relevant sentence")
+    reason: str = Field(
+        description="Brief explanation of why this sentence is relevant",
+        max_length=80,
+    )
+
+
+# Relevant sentences for one chunk; the list defaults to empty when omitted.
+class ChunkHighlights(BaseModel):
+    document_id: str
+    chunk_index: int
+    relevant_sentences: list[RelevantSentence] = Field(default_factory=list)
+
+
+# Batch wrapper: a required list of ChunkHighlights.
+class HighlightBatchResult(BaseModel):
+    results: list[ChunkHighlights]
+
+
+# Outcome summary: status is restricted to three literals; the other fields have defaults.
+class HighlightBatchResponse(BaseModel):
+    status: Literal["completed", "partial", "failed"]
+    cached_count: int = 0
+    errors: list[str] = Field(default_factory=list)
diff --git a/backend/app/test/test_phase5_highlight_models.py b/backend/app/test/test_phase5_highlight_models.py
new file mode 100644
index 0000000..7481005
--- /dev/null
+++ b/backend/app/test/test_phase5_highlight_models.py
@@ -0,0 +1,375 @@
+"""Tests for Phase 5.4 Highlight Pydantic models.
+
+Validates ChunkHighlightTarget, HighlightBatchRequest, RelevantSentence,
+ChunkHighlights, HighlightBatchResult, and HighlightBatchResponse models.
+Ensures correct validation, defaults, and serialization.
+"""
+
+import pytest
+from pydantic import ValidationError
+
+
+class TestChunkHighlightTarget:
+    """Tests for ChunkHighlightTarget model."""
+
+    def test_valid_creation(self):
+        """Should create a valid ChunkHighlightTarget with all fields."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        target = ChunkHighlightTarget(
+            document_id="doc-123",
+            chunk_index=5,
+            sub_question_text="What is the main topic?",
+            sub_question_index=0,
+        )
+        assert target.document_id == "doc-123"
+        assert target.chunk_index == 5
+        assert target.sub_question_text == "What is the main topic?"
+        assert target.sub_question_index == 0
+
+    def test_missing_document_id_rejected(self):
+        """Missing document_id should raise ValidationError."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        with pytest.raises(ValidationError):
+            ChunkHighlightTarget(
+                chunk_index=0,
+                sub_question_text="test",
+                sub_question_index=0,
+            )
+
+    def test_missing_chunk_index_rejected(self):
+        """Missing chunk_index should raise ValidationError."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        with pytest.raises(ValidationError):
+            ChunkHighlightTarget(
+                document_id="doc-123",
+                sub_question_text="test",
+                sub_question_index=0,
+            )
+
+    def test_negative_chunk_index_accepted(self):
+        """Negative chunk_index should be accepted (no gt constraint)."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        target = ChunkHighlightTarget(
+            document_id="doc-123",
+            chunk_index=-1,
+            sub_question_text="test",
+            sub_question_index=0,
+        )
+        assert target.chunk_index == -1
+
+
+class TestHighlightBatchRequest:
+    """Tests for HighlightBatchRequest model."""
+
+    def test_valid_with_multiple_targets(self):
+        """Should accept a list of ChunkHighlightTarget objects."""
+        from app.models.highlight import ChunkHighlightTarget, HighlightBatchRequest
+
+        request = HighlightBatchRequest(
+            targets=[
+                ChunkHighlightTarget(
+                    document_id="doc-123",
+                    chunk_index=0,
+                    sub_question_text="Q1",
+                    sub_question_index=0,
+                ),
+                ChunkHighlightTarget(
+                    document_id="doc-456",
+                    chunk_index=1,
+                    sub_question_text="Q2",
+                    sub_question_index=1,
+                ),
+            ]
+        )
+        assert len(request.targets) == 2
+        assert request.targets[0].document_id == "doc-123"
+        assert request.targets[1].document_id == "doc-456"
+
+    def test_empty_targets_accepted(self):
+        """Empty targets list should be accepted."""
+        from app.models.highlight import HighlightBatchRequest
+
+        request = HighlightBatchRequest(targets=[])
+        assert request.targets == []
+
+    def test_missing_targets_rejected(self):
+        """Missing targets field should raise ValidationError."""
+        from app.models.highlight import HighlightBatchRequest
+
+        with pytest.raises(ValidationError):
+            HighlightBatchRequest()  # type: ignore
+
+    def test_invalid_target_type_rejected(self):
+        """Non-ChunkHighlightTarget items should raise ValidationError."""
+        from app.models.highlight import HighlightBatchRequest
+
+        with pytest.raises(ValidationError):
+            HighlightBatchRequest(targets=["not a target"])  # type: ignore
+
+
+class TestRelevantSentence:
+    """Tests for RelevantSentence model."""
+
+    def test_valid_creation(self):
+        """Should create a valid RelevantSentence with description fields."""
+        from app.models.highlight import RelevantSentence
+
+        rs = RelevantSentence(
+            sentence_index=3,
+            reason="Directly answers the sub-question",
+        )
+        assert rs.sentence_index == 3
+        assert rs.reason == "Directly answers the sub-question"
+
+    def test_reason_max_length_enforced(self):
+        """Reason exceeding max_length=80 should raise ValidationError."""
+        from app.models.highlight import RelevantSentence
+
+        with pytest.raises(ValidationError, match="reason"):
+            RelevantSentence(
+                sentence_index=0,
+                reason="x" * 81,
+            )
+
+    def test_reason_at_max_length_accepted(self):
+        """Reason exactly at max_length=80 should be accepted."""
+        from app.models.highlight import RelevantSentence
+
+        rs = RelevantSentence(
+            sentence_index=0,
+            reason="x" * 80,
+        )
+        assert len(rs.reason) == 80
+
+    def test_missing_sentence_index_rejected(self):
+        """Missing sentence_index should raise ValidationError."""
+        from app.models.highlight import RelevantSentence
+
+        with pytest.raises(ValidationError):
+            RelevantSentence(reason="test")
+
+    def test_missing_reason_rejected(self):
+        """Missing reason should raise ValidationError."""
+        from app.models.highlight import RelevantSentence
+
+        with pytest.raises(ValidationError):
+            RelevantSentence(sentence_index=0)
+
+
+class TestChunkHighlights:
+    """Tests for ChunkHighlights model."""
+
+    def test_valid_with_sentences(self):
+        """Should create ChunkHighlights with relevant_sentences."""
+        from app.models.highlight import ChunkHighlights, RelevantSentence
+
+        ch = ChunkHighlights(
+            document_id="doc-123",
+            chunk_index=0,
+            relevant_sentences=[
+                RelevantSentence(sentence_index=1, reason="Key point"),
+                RelevantSentence(sentence_index=3, reason="Supports answer"),
+            ],
+        )
+        assert ch.document_id == "doc-123"
+        assert ch.chunk_index == 0
+        assert len(ch.relevant_sentences) == 2
+        assert ch.relevant_sentences[0].sentence_index == 1
+
+    def test_default_empty_sentences(self):
+        """Default relevant_sentences should be an empty list."""
+        from app.models.highlight import ChunkHighlights
+
+        ch = ChunkHighlights(
+            document_id="doc-123",
+            chunk_index=0,
+        )
+        assert ch.relevant_sentences == []
+
+    def test_explicit_empty_sentences(self):
+        """Explicitly passing empty list should work."""
+        from app.models.highlight import ChunkHighlights
+
+        ch = ChunkHighlights(
+            document_id="doc-123",
+            chunk_index=0,
+            relevant_sentences=[],
+        )
+        assert ch.relevant_sentences == []
+
+
+class TestHighlightBatchResult:
+    """Tests for HighlightBatchResult model."""
+
+    def test_valid_with_results(self):
+        """Should create HighlightBatchResult with ChunkHighlights list."""
+        from app.models.highlight import (
+            ChunkHighlights,
+            HighlightBatchResult,
+            RelevantSentence,
+        )
+
+        result = HighlightBatchResult(
+            results=[
+                ChunkHighlights(
+                    document_id="doc-123",
+                    chunk_index=0,
+                    relevant_sentences=[
+                        RelevantSentence(sentence_index=0, reason="First")
+                    ],
+                ),
+            ]
+        )
+        assert len(result.results) == 1
+        assert result.results[0].document_id == "doc-123"
+
+    def test_empty_results_accepted(self):
+        """Empty results list should be accepted."""
+        from app.models.highlight import HighlightBatchResult
+
+        result = HighlightBatchResult(results=[])
+        assert result.results == []
+
+    def test_missing_results_rejected(self):
+        """Missing results field should raise ValidationError."""
+        from app.models.highlight import HighlightBatchResult
+
+        with pytest.raises(ValidationError):
+            HighlightBatchResult()  # type: ignore
+
+
+class TestHighlightBatchResponse:
+    """Tests for HighlightBatchResponse model."""
+
+    def test_status_completed(self):
+        """Should accept 'completed' status."""
+        from app.models.highlight import HighlightBatchResponse
+
+        resp = HighlightBatchResponse(status="completed")
+        assert resp.status == "completed"
+        assert resp.cached_count == 0
+        assert resp.errors == []
+
+    def test_status_partial(self):
+        """Should accept 'partial' status."""
+        from app.models.highlight import HighlightBatchResponse
+
+        resp = HighlightBatchResponse(status="partial", cached_count=2)
+        assert resp.status == "partial"
+        assert resp.cached_count == 2
+
+    def test_status_failed(self):
+        """Should accept 'failed' status with errors."""
+        from app.models.highlight import HighlightBatchResponse
+
+        resp = HighlightBatchResponse(
+            status="failed",
+            errors=["document not found", "chunk out of range"],
+        )
+        assert resp.status == "failed"
+        assert len(resp.errors) == 2
+
+    def test_invalid_status_rejected(self):
+        """Status not in Literal should raise ValidationError."""
+        from app.models.highlight import HighlightBatchResponse
+
+        with pytest.raises(ValidationError):
+            HighlightBatchResponse(status="unknown")  # type: ignore
+
+    def test_default_cached_count(self):
+        """Default cached_count should be 0."""
+        from app.models.highlight import HighlightBatchResponse
+
+        resp = HighlightBatchResponse(status="completed")
+        assert resp.cached_count == 0
+
+    def test_default_errors(self):
+        """Default errors should be an empty list."""
+        from app.models.highlight import HighlightBatchResponse
+
+        resp = HighlightBatchResponse(status="completed")
+        assert resp.errors == []
+
+
+class TestSerialization:
+    """Tests for model_dump() serialization."""
+
+    def test_chunk_highlight_target_dump(self):
+        """model_dump() should produce expected dict for ChunkHighlightTarget."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        target = ChunkHighlightTarget(
+            document_id="doc-123",
+            chunk_index=5,
+            sub_question_text="What is the main topic?",
+            sub_question_index=0,
+        )
+        data = target.model_dump()
+        assert data == {
+            "document_id": "doc-123",
+            "chunk_index": 5,
+            "sub_question_text": "What is the main topic?",
+            "sub_question_index": 0,
+        }
+
+    def test_highlight_batch_request_dump(self):
+        """model_dump() should produce expected nested dict."""
+        from app.models.highlight import ChunkHighlightTarget, HighlightBatchRequest
+
+        request = HighlightBatchRequest(
+            targets=[
+                ChunkHighlightTarget(
+                    document_id="doc-123",
+                    chunk_index=0,
+                    sub_question_text="Q1",
+                    sub_question_index=0,
+                ),
+            ]
+        )
+        data = request.model_dump()
+        assert data == {
+            "targets": [
+                {
+                    "document_id": "doc-123",
+                    "chunk_index": 0,
+                    "sub_question_text": "Q1",
+                    "sub_question_index": 0,
+                },
+            ]
+        }
+
+    def test_chunk_highlights_dump(self):
+        """model_dump() should include default empty list for relevant_sentences."""
+        from app.models.highlight import ChunkHighlights
+
+        ch = ChunkHighlights(document_id="doc-123", chunk_index=0)
+        data = ch.model_dump()
+        assert data == {
+            "document_id": "doc-123",
+            "chunk_index": 0,
+            "relevant_sentences": [],
+        }
+
+    def test_highlight_batch_response_dump(self):
+        """model_dump() should produce expected dict with defaults."""
+        from app.models.highlight import HighlightBatchResponse
+
+        resp = HighlightBatchResponse(status="partial", cached_count=3)
+        data = resp.model_dump()
+        assert data == {
+            "status": "partial",
+            "cached_count": 3,
+            "errors": [],
+        }
+
+    def test_relevant_sentence_reason_max_length(self):
+        """model_dump() should preserve reason at max length."""
+        from app.models.highlight import RelevantSentence
+
+        rs = RelevantSentence(sentence_index=0, reason="x" * 80)
+        data = rs.model_dump()
+        assert data["reason"] == "x" * 80
diff --git a/backend/app/test/test_phase5_sentence_splitter.py b/backend/app/test/test_phase5_sentence_splitter.py
new file mode 100644
index 0000000..530b69d
--- /dev/null
+++ b/backend/app/test/test_phase5_sentence_splitter.py
@@ -0,0 +1,101 @@
+"""Phase 5 tests: Sentence splitter utility.
+
+Tests for backend/app/utils/sentence_splitter.py covering:
+- English sentence boundaries (., !, ?)
+- Chinese sentence boundaries (。, !, ?)
+- Mixed English/Chinese text
+- Empty and whitespace-only input
+- Single sentence without trailing punctuation
+- Bullet list items
+- Multiple newlines as sentence boundaries
+- Trailing/leading whitespace handling
+"""
+
+import importlib.util
+from pathlib import Path
+
+# Dynamically load the sentence_splitter module to avoid package-path import issues.
+MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "sentence_splitter.py"
+spec = importlib.util.spec_from_file_location("sentence_splitter", str(MODULE_PATH))
+assert spec is not None and spec.loader is not None  # checked before spec is first used
+sentence_splitter = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(sentence_splitter)
+split_sentences = sentence_splitter.split_sentences
+
+
+def test_empty_string_returns_empty_list():
+    assert split_sentences("") == []
+
+
+def test_whitespace_only_returns_empty_list():
+    assert split_sentences(" ") == []
+    assert split_sentences("\n\t ") == []
+
+
+def test_single_sentence_no_punctuation():
+    text = "Hello world"
+    result = split_sentences(text)
+    assert result == ["Hello world"]
+
+
+def test_english_sentences():
+    text = "First sentence. Second sentence! Third sentence?"
+    result = split_sentences(text)
+    assert result == ["First sentence.", "Second sentence!", "Third sentence?"]
+
+
+def test_chinese_sentences():
+    text = "第一句。第二句!第三句?"
+    result = split_sentences(text)
+    assert result == ["第一句。", "第二句!", "第三句?"]
+
+
+def test_mixed_english_chinese():
+    text = "The date is 2024年1月1日。The Contractor shall start work on Monday."
+    result = split_sentences(text)
+    assert result == [
+        "The date is 2024年1月1日。",
+        "The Contractor shall start work on Monday.",
+    ]
+
+
+def test_bullet_list_items_with_periods():
+    text = "- Item one. - Item two. - Item three."
+    result = split_sentences(text)
+    assert result == ["- Item one.", "- Item two.", "- Item three."]
+
+
+def test_multiple_newlines_as_boundaries():
+    text = "First paragraph\n\nSecond paragraph"
+    result = split_sentences(text)
+    assert result == ["First paragraph", "Second paragraph"]
+
+
+def test_single_newline_does_not_split():
+    text = "Line one\nLine two"
+    result = split_sentences(text)
+    assert result == ["Line one\nLine two"]
+
+
+def test_trailing_whitespace_handled():
+    text = "Hello world. \n\n Another sentence. "
+    result = split_sentences(text)
+    assert result == ["Hello world.", "Another sentence."]
+
+
+def test_leading_whitespace_handled():
+    text = " Hello world. Another sentence."
+    result = split_sentences(text)
+    assert result == ["Hello world.", "Another sentence."]
+
+
+def test_exclamation_and_question_marks():
+    text = "What is this? It is amazing! Really."
+    result = split_sentences(text)
+    assert result == ["What is this?", "It is amazing!", "Really."]
+
+
+def test_chinese_mixed_punctuation():
+    text = "你好,世界。How are you? 我很好!"
+    result = split_sentences(text)
+    assert result == ["你好,世界。", "How are you?", "我很好!"]
diff --git a/backend/app/utils/sentence_splitter.py b/backend/app/utils/sentence_splitter.py
new file mode 100644
index 0000000..8f8079a
--- /dev/null
+++ b/backend/app/utils/sentence_splitter.py
@@ -0,0 +1,34 @@
+import re
+
+# Split points. Fullwidth "!" / "?" are written as escapes so the pattern
+# survives Unicode normalization of this source file.
+_SENTENCE_BOUNDARY = re.compile(
+    # Just after a sentence terminator ...
+    r"(?<=[.!?。\uff01\uff1f])"
+    # ... that is the last of its run, so "..." and "?!" stay in one piece ...
+    r"(?![.!?。\uff01\uff1f])"
+    # ... and is not a decimal point between two digits ("3.14").
+    r"(?!(?<=\d\.)\d)"
+    # Swallow any whitespace that follows the terminator.
+    r"\s*"
+    # Alternatively: a line break followed by further whitespace (blank line).
+    r"|(?<=\n)\s+"
+)
+
+
+def split_sentences(text: str) -> list[str]:
+    """Split *text* into stripped, non-empty sentences.
+
+    A sentence ends at ``.``, ``!``, ``?``, the ideographic full stop, or a
+    fullwidth ``!``/``?``; no whitespace is required after the terminator.
+    A newline followed by more whitespace (e.g. a blank line) is also a
+    boundary, while a lone newline is not. Runs of terminators (``...``,
+    ``?!``) and decimal points between digits (``3.14``) are kept intact.
+    Abbreviations such as ``e.g.`` are not recognised and still split.
+
+    Returns an empty list for empty or whitespace-only input.
+    """
+    if not text or not text.strip():
+        return []
+    pieces = _SENTENCE_BOUNDARY.split(text)
+    return [sentence for piece in pieces if (sentence := piece.strip())]