feat: add sentence splitter and highlight data models (Phase 5.4.1-5.4.2)

- sentence_splitter.py: regex-based sentence splitting for English + Chinese punctuation
- highlight.py: 6 Pydantic models (ChunkHighlightTarget, HighlightBatchRequest, RelevantSentence, ChunkHighlights, HighlightBatchResult, HighlightBatchResponse)
- 43 tests: 13 sentence splitter + 30 model validation
2026-04-29 09:26:06 +08:00 · 2026-04-29 09:26:06 +08:00 · b11d31e2d1
parent ec3b5a4ae1
commit b11d31e2d1
4 changed files with 522 additions and 0 deletions
--- a/backend/app/models/highlight.py
+++ b/backend/app/models/highlight.py
@ -0,0 +1,38 @@
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class ChunkHighlightTarget(BaseModel):
+    # One (document chunk, sub-question) pair; all four fields are required.
+    # NOTE(review): presumably names the chunk to highlight for that
+    # sub-question — confirm against the code that consumes this model.
+    document_id: str
+    chunk_index: int  # no range constraint declared, so negative values validate
+    sub_question_text: str
+    sub_question_index: int
+
+
+class HighlightBatchRequest(BaseModel):
+    # Wrapper around a required list of targets. No minimum length is
+    # declared, so an empty list validates.
+    targets: list[ChunkHighlightTarget]
+
+
+class RelevantSentence(BaseModel):
+    # A sentence position plus a short justification; both fields are required.
+    # The index is described as 0-based, but no lower bound is enforced here.
+    sentence_index: int = Field(description="0-based index of the relevant sentence")
+    reason: str = Field(
+        description="Brief explanation of why this sentence is relevant",
+        max_length=80,  # values longer than 80 characters fail validation
+    )
+
+
+class ChunkHighlights(BaseModel):
+    # Relevant sentences recorded for one chunk, identified by document_id
+    # and chunk_index.
+    document_id: str
+    chunk_index: int
+    # default_factory gives each instance its own empty list when omitted.
+    relevant_sentences: list[RelevantSentence] = Field(default_factory=list)
+
+
+class HighlightBatchResult(BaseModel):
+    # Required list of per-chunk highlight entries; an empty list validates.
+    results: list[ChunkHighlights]
+
+
+class HighlightBatchResponse(BaseModel):
+    # Outcome summary: a required status plus an optional count and error list.
+    status: Literal["completed", "partial", "failed"]  # any other value is rejected
+    # NOTE(review): presumably a count of cache-related results — confirm with the caller.
+    cached_count: int = 0
+    # default_factory gives each instance its own empty list when omitted.
+    errors: list[str] = Field(default_factory=list)
--- a/backend/app/test/test_phase5_highlight_models.py
+++ b/backend/app/test/test_phase5_highlight_models.py
@ -0,0 +1,375 @@
+"""Tests for Phase 5.4 Highlight Pydantic models.
+
+Validates ChunkHighlightTarget, HighlightBatchRequest, RelevantSentence,
+ChunkHighlights, HighlightBatchResult, and HighlightBatchResponse models.
+Ensures correct validation, defaults, and serialization.
+"""
+
+import pytest
+from pydantic import ValidationError
+
+
+class TestChunkHighlightTarget:
+    """Construction and validation checks for ChunkHighlightTarget."""
+
+    def test_valid_creation(self):
+        """A fully specified target exposes every field unchanged."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        fields = {
+            "document_id": "doc-123",
+            "chunk_index": 5,
+            "sub_question_text": "What is the main topic?",
+            "sub_question_index": 0,
+        }
+        built = ChunkHighlightTarget(**fields)
+        assert built.document_id == "doc-123"
+        assert built.chunk_index == 5
+        assert built.sub_question_text == "What is the main topic?"
+        assert built.sub_question_index == 0
+
+    def test_missing_document_id_rejected(self):
+        """Omitting document_id fails validation."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        without_document_id = {
+            "chunk_index": 0,
+            "sub_question_text": "test",
+            "sub_question_index": 0,
+        }
+        with pytest.raises(ValidationError):
+            ChunkHighlightTarget(**without_document_id)
+
+    def test_missing_chunk_index_rejected(self):
+        """Omitting chunk_index fails validation."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        without_chunk_index = {
+            "document_id": "doc-123",
+            "sub_question_text": "test",
+            "sub_question_index": 0,
+        }
+        with pytest.raises(ValidationError):
+            ChunkHighlightTarget(**without_chunk_index)
+
+    def test_negative_chunk_index_accepted(self):
+        """chunk_index carries no lower bound, so -1 is stored as given."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        negative = ChunkHighlightTarget(
+            sub_question_index=0,
+            sub_question_text="test",
+            chunk_index=-1,
+            document_id="doc-123",
+        )
+        assert negative.chunk_index == -1
+
+
+class TestHighlightBatchRequest:
+    """Construction and validation checks for HighlightBatchRequest."""
+
+    def test_valid_with_multiple_targets(self):
+        """Two targets are stored in the order they were supplied."""
+        from app.models.highlight import ChunkHighlightTarget, HighlightBatchRequest
+
+        first = ChunkHighlightTarget(
+            document_id="doc-123",
+            chunk_index=0,
+            sub_question_text="Q1",
+            sub_question_index=0,
+        )
+        second = ChunkHighlightTarget(
+            document_id="doc-456",
+            chunk_index=1,
+            sub_question_text="Q2",
+            sub_question_index=1,
+        )
+        batch = HighlightBatchRequest(targets=[first, second])
+        assert len(batch.targets) == 2
+        assert batch.targets[0].document_id == "doc-123"
+        assert batch.targets[1].document_id == "doc-456"
+
+    def test_empty_targets_accepted(self):
+        """An empty targets list validates and is stored as-is."""
+        from app.models.highlight import HighlightBatchRequest
+
+        batch = HighlightBatchRequest(targets=[])
+        assert batch.targets == []
+
+    def test_missing_targets_rejected(self):
+        """Constructing without targets fails validation."""
+        from app.models.highlight import HighlightBatchRequest
+
+        with pytest.raises(ValidationError):
+            HighlightBatchRequest()  # type: ignore
+
+    def test_invalid_target_type_rejected(self):
+        """A plain string cannot stand in for a ChunkHighlightTarget."""
+        from app.models.highlight import HighlightBatchRequest
+
+        bogus_targets = ["not a target"]
+        with pytest.raises(ValidationError):
+            HighlightBatchRequest(targets=bogus_targets)  # type: ignore
+
+
+class TestRelevantSentence:
+    """Construction and validation checks for RelevantSentence."""
+
+    def test_valid_creation(self):
+        """Index and reason are stored exactly as supplied."""
+        from app.models.highlight import RelevantSentence
+
+        sentence = RelevantSentence(
+            reason="Directly answers the sub-question",
+            sentence_index=3,
+        )
+        assert sentence.sentence_index == 3
+        assert sentence.reason == "Directly answers the sub-question"
+
+    def test_reason_max_length_enforced(self):
+        """An 81-character reason exceeds max_length=80 and is rejected."""
+        from app.models.highlight import RelevantSentence
+
+        too_long = "x" * 81
+        with pytest.raises(ValidationError, match="reason"):
+            RelevantSentence(sentence_index=0, reason=too_long)
+
+    def test_reason_at_max_length_accepted(self):
+        """An 80-character reason sits exactly on the limit and validates."""
+        from app.models.highlight import RelevantSentence
+
+        at_limit = "x" * 80
+        sentence = RelevantSentence(sentence_index=0, reason=at_limit)
+        assert len(sentence.reason) == 80
+
+    def test_missing_sentence_index_rejected(self):
+        """Omitting sentence_index fails validation."""
+        from app.models.highlight import RelevantSentence
+
+        only_reason = {"reason": "test"}
+        with pytest.raises(ValidationError):
+            RelevantSentence(**only_reason)
+
+    def test_missing_reason_rejected(self):
+        """Omitting reason fails validation."""
+        from app.models.highlight import RelevantSentence
+
+        only_index = {"sentence_index": 0}
+        with pytest.raises(ValidationError):
+            RelevantSentence(**only_index)
+
+
+class TestChunkHighlights:
+    """Construction and default-value checks for ChunkHighlights."""
+
+    def test_valid_with_sentences(self):
+        """Supplied relevant sentences are stored in order."""
+        from app.models.highlight import ChunkHighlights, RelevantSentence
+
+        sentences = [
+            RelevantSentence(sentence_index=1, reason="Key point"),
+            RelevantSentence(sentence_index=3, reason="Supports answer"),
+        ]
+        highlights = ChunkHighlights(
+            document_id="doc-123",
+            chunk_index=0,
+            relevant_sentences=sentences,
+        )
+        assert highlights.document_id == "doc-123"
+        assert highlights.chunk_index == 0
+        assert len(highlights.relevant_sentences) == 2
+        assert highlights.relevant_sentences[0].sentence_index == 1
+
+    def test_default_empty_sentences(self):
+        """Leaving relevant_sentences out yields an empty list."""
+        from app.models.highlight import ChunkHighlights
+
+        highlights = ChunkHighlights(chunk_index=0, document_id="doc-123")
+        assert highlights.relevant_sentences == []
+
+    def test_explicit_empty_sentences(self):
+        """An explicitly empty relevant_sentences list is kept as-is."""
+        from app.models.highlight import ChunkHighlights
+
+        no_sentences = []
+        highlights = ChunkHighlights(
+            document_id="doc-123",
+            chunk_index=0,
+            relevant_sentences=no_sentences,
+        )
+        assert highlights.relevant_sentences == []
+
+
+class TestHighlightBatchResult:
+    """Construction and validation checks for HighlightBatchResult."""
+
+    def test_valid_with_results(self):
+        """One ChunkHighlights entry is stored and readable from results."""
+        from app.models.highlight import (
+            ChunkHighlights,
+            HighlightBatchResult,
+            RelevantSentence,
+        )
+
+        sentence = RelevantSentence(sentence_index=0, reason="First")
+        entry = ChunkHighlights(
+            document_id="doc-123",
+            chunk_index=0,
+            relevant_sentences=[sentence],
+        )
+        batch = HighlightBatchResult(results=[entry])
+        assert len(batch.results) == 1
+        assert batch.results[0].document_id == "doc-123"
+
+    def test_empty_results_accepted(self):
+        """An empty results list validates and is stored as-is."""
+        from app.models.highlight import HighlightBatchResult
+
+        batch = HighlightBatchResult(results=[])
+        assert batch.results == []
+
+    def test_missing_results_rejected(self):
+        """Constructing without results fails validation."""
+        from app.models.highlight import HighlightBatchResult
+
+        with pytest.raises(ValidationError):
+            HighlightBatchResult()  # type: ignore
+
+
+class TestHighlightBatchResponse:
+    """Status, default-value and validation checks for HighlightBatchResponse."""
+
+    def test_status_completed(self):
+        """'completed' validates and the other fields take their defaults."""
+        from app.models.highlight import HighlightBatchResponse
+
+        response = HighlightBatchResponse(status="completed")
+        assert response.status == "completed"
+        assert response.cached_count == 0
+        assert response.errors == []
+
+    def test_status_partial(self):
+        """'partial' validates together with an explicit cached_count."""
+        from app.models.highlight import HighlightBatchResponse
+
+        response = HighlightBatchResponse(cached_count=2, status="partial")
+        assert response.status == "partial"
+        assert response.cached_count == 2
+
+    def test_status_failed(self):
+        """'failed' validates and keeps the supplied error messages."""
+        from app.models.highlight import HighlightBatchResponse
+
+        messages = ["document not found", "chunk out of range"]
+        response = HighlightBatchResponse(status="failed", errors=messages)
+        assert response.status == "failed"
+        assert len(response.errors) == 2
+
+    def test_invalid_status_rejected(self):
+        """A status outside the Literal choices fails validation."""
+        from app.models.highlight import HighlightBatchResponse
+
+        with pytest.raises(ValidationError):
+            HighlightBatchResponse(status="unknown")  # type: ignore
+
+    def test_default_cached_count(self):
+        """cached_count is 0 when not supplied."""
+        from app.models.highlight import HighlightBatchResponse
+
+        response = HighlightBatchResponse(status="completed")
+        assert response.cached_count == 0
+
+    def test_default_errors(self):
+        """errors is an empty list when not supplied."""
+        from app.models.highlight import HighlightBatchResponse
+
+        response = HighlightBatchResponse(status="completed")
+        assert response.errors == []
+
+
+class TestSerialization:
+    """Checks that model_dump() emits the expected plain dictionaries."""
+
+    def test_chunk_highlight_target_dump(self):
+        """A target dumps to a dict holding exactly its four fields."""
+        from app.models.highlight import ChunkHighlightTarget
+
+        fields = {
+            "document_id": "doc-123",
+            "chunk_index": 5,
+            "sub_question_text": "What is the main topic?",
+            "sub_question_index": 0,
+        }
+        dumped = ChunkHighlightTarget(**fields).model_dump()
+        assert dumped == {
+            "document_id": "doc-123",
+            "chunk_index": 5,
+            "sub_question_text": "What is the main topic?",
+            "sub_question_index": 0,
+        }
+
+    def test_highlight_batch_request_dump(self):
+        """A request dumps its targets as a list of nested dicts."""
+        from app.models.highlight import ChunkHighlightTarget, HighlightBatchRequest
+
+        only_target = ChunkHighlightTarget(
+            document_id="doc-123",
+            chunk_index=0,
+            sub_question_text="Q1",
+            sub_question_index=0,
+        )
+        dumped = HighlightBatchRequest(targets=[only_target]).model_dump()
+        expected_target = {
+            "document_id": "doc-123",
+            "chunk_index": 0,
+            "sub_question_text": "Q1",
+            "sub_question_index": 0,
+        }
+        assert dumped == {"targets": [expected_target]}
+
+    def test_chunk_highlights_dump(self):
+        """The defaulted relevant_sentences appears in the dump as []."""
+        from app.models.highlight import ChunkHighlights
+
+        highlights = ChunkHighlights(document_id="doc-123", chunk_index=0)
+        dumped = highlights.model_dump()
+        assert dumped == {
+            "document_id": "doc-123",
+            "chunk_index": 0,
+            "relevant_sentences": [],
+        }
+
+    def test_highlight_batch_response_dump(self):
+        """The dump carries supplied values plus the defaulted errors list."""
+        from app.models.highlight import HighlightBatchResponse
+
+        response = HighlightBatchResponse(status="partial", cached_count=3)
+        dumped = response.model_dump()
+        assert dumped == {
+            "status": "partial",
+            "cached_count": 3,
+            "errors": [],
+        }
+
+    def test_relevant_sentence_reason_max_length(self):
+        """An 80-character reason survives model_dump() unchanged."""
+        from app.models.highlight import RelevantSentence
+
+        sentence = RelevantSentence(sentence_index=0, reason="x" * 80)
+        dumped = sentence.model_dump()
+        assert dumped["reason"] == "x" * 80
--- a/backend/app/test/test_phase5_sentence_splitter.py
+++ b/backend/app/test/test_phase5_sentence_splitter.py
@ -0,0 +1,101 @@
+"""Phase 5 tests: Sentence splitter utility.
+
+Tests for backend/app/utils/sentence_splitter.py covering:
+- English sentence boundaries (., !, ?)
+- Chinese sentence boundaries (。, ！, ？)
+- Mixed English/Chinese text
+- Empty and whitespace-only input
+- Single sentence without trailing punctuation
+- Bullet list items
+- Multiple newlines as sentence boundaries
+- Trailing/leading whitespace handling
+"""
+
+import importlib.util
+from pathlib import Path
+
+# Load sentence_splitter.py straight from its file path so these tests do not
+# depend on the ``app`` package being importable.
+MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "sentence_splitter.py"
+spec = importlib.util.spec_from_file_location("sentence_splitter", str(MODULE_PATH))
+# spec_from_file_location may return None; check before the spec is used so a
+# failure is reported here instead of as an AttributeError in module_from_spec.
+assert spec is not None and spec.loader is not None, f"cannot load {MODULE_PATH}"
+sentence_splitter = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(sentence_splitter)
+split_sentences = sentence_splitter.split_sentences
+
+
+def test_empty_string_returns_empty_list():
+    """An empty string yields no sentences."""
+    sentences = split_sentences("")
+    assert sentences == []
+
+
+def test_whitespace_only_returns_empty_list():
+    """Input made only of whitespace yields no sentences."""
+    for blank in ("   ", "\n\t  "):
+        assert split_sentences(blank) == []
+
+
+def test_single_sentence_no_punctuation():
+    """Text without any terminator comes back as one sentence."""
+    sentences = split_sentences("Hello world")
+    assert sentences == ["Hello world"]
+
+
+def test_english_sentences():
+    """'.', '!' and '?' each end an English sentence."""
+    expected = ["First sentence.", "Second sentence!", "Third sentence?"]
+    sentences = split_sentences("First sentence. Second sentence! Third sentence?")
+    assert sentences == expected
+
+
+def test_chinese_sentences():
+    """Full-width terminators split even with no whitespace after them."""
+    expected = ["第一句。", "第二句！", "第三句？"]
+    sentences = split_sentences("第一句。第二句！第三句？")
+    assert sentences == expected
+
+
+def test_mixed_english_chinese():
+    """A Chinese full stop directly followed by English text still splits."""
+    source = "The date is 2024年1月1日。The Contractor shall start work on Monday."
+    expected = [
+        "The date is 2024年1月1日。",
+        "The Contractor shall start work on Monday.",
+    ]
+    assert split_sentences(source) == expected
+
+
+def test_bullet_list_items_with_periods():
+    """Inline bullet items ending in periods become separate sentences."""
+    expected = ["- Item one.", "- Item two.", "- Item three."]
+    sentences = split_sentences("- Item one. - Item two. - Item three.")
+    assert sentences == expected
+
+
+def test_multiple_newlines_as_boundaries():
+    """A blank line separates two unpunctuated paragraphs."""
+    sentences = split_sentences("First paragraph\n\nSecond paragraph")
+    assert sentences == ["First paragraph", "Second paragraph"]
+
+
+def test_single_newline_does_not_split():
+    """A lone newline between two lines is not a sentence boundary."""
+    source = "Line one\nLine two"
+    assert split_sentences(source) == [source]
+
+
+def test_trailing_whitespace_handled():
+    """Whitespace after each sentence is trimmed from the results."""
+    sentences = split_sentences("Hello world.   \n\n  Another sentence.  ")
+    assert sentences == ["Hello world.", "Another sentence."]
+
+
+def test_leading_whitespace_handled():
+    """Whitespace before the first sentence is trimmed from the results."""
+    sentences = split_sentences("   Hello world. Another sentence.")
+    assert sentences == ["Hello world.", "Another sentence."]
+
+
+def test_exclamation_and_question_marks():
+    """'?', '!' and '.' in sequence each close their own sentence."""
+    expected = ["What is this?", "It is amazing!", "Really."]
+    sentences = split_sentences("What is this? It is amazing! Really.")
+    assert sentences == expected
+
+
+def test_chinese_mixed_punctuation():
+    """Chinese and English sentences interleave; the comma '，' does not split."""
+    expected = ["你好，世界。", "How are you?", "我很好！"]
+    sentences = split_sentences("你好，世界。How are you? 我很好！")
+    assert sentences == expected
--- a/backend/app/utils/sentence_splitter.py
+++ b/backend/app/utils/sentence_splitter.py
@ -0,0 +1,8 @@
+import re
+
+
+def split_sentences(text: str) -> list[str]:
+    """Split *text* into trimmed, non-empty sentences.
+
+    A boundary is recognised:
+
+    * after an ASCII terminator (``.``, ``!``, ``?``) that is followed by
+      whitespace, so ``3.14``, ``...`` and ``?!`` are not broken apart;
+    * after a full-width terminator (``。``, ``！``, ``？``), with or without
+      following whitespace, unless another full-width terminator or a
+      closing quote/bracket comes next (keeps ``？！`` and ``。”`` together);
+    * after a newline that is followed by more whitespace (e.g. a blank line).
+
+    A single newline inside running text is not a boundary. Abbreviations
+    followed by a space (``e.g. ``) are still treated as sentence ends.
+
+    Args:
+        text: The text to split.
+
+    Returns:
+        The sentences in order, each stripped of surrounding whitespace.
+        Empty or whitespace-only input returns an empty list.
+    """
+    if not text or not text.strip():
+        return []
+    boundary = (
+        r"(?<=[.!?])\s+"
+        r"|(?<=[。！？])(?![。！？”’」』）])\s*"
+        r"|(?<=\n)\s+"
+    )
+    trimmed = (part.strip() for part in re.split(boundary, text))
+    return [sentence for sentence in trimmed if sentence]