feat: add sentence splitter and highlight data models (Phase 5.4.1-5.4.2)

- sentence_splitter.py: regex-based sentence splitting for English + Chinese punctuation
- highlight.py: 6 Pydantic models (ChunkHighlightTarget, HighlightBatchRequest,
  RelevantSentence, ChunkHighlights, HighlightBatchResult, HighlightBatchResponse)
- 43 tests: 13 sentence splitter + 30 model validation
This commit is contained in:
Woody 2026-04-29 09:26:06 +08:00
parent ec3b5a4ae1
commit b11d31e2d1
4 changed files with 522 additions and 0 deletions

View File

@ -0,0 +1,38 @@
from typing import Literal
from pydantic import BaseModel, Field
# One highlight request item: a document chunk paired with a sub-question.
# (Meaning inferred from the field names; confirm against the calling code.)
# All four fields are required; none declares a default or a range constraint.
class ChunkHighlightTarget(BaseModel):
    document_id: str
    chunk_index: int  # no lower bound declared, so negative values validate
    sub_question_text: str
    sub_question_index: int
# Request wrapper carrying a list of ChunkHighlightTarget items.
# `targets` is required but may be an empty list (no minimum length is set).
class HighlightBatchRequest(BaseModel):
    targets: list[ChunkHighlightTarget]
# A single sentence flagged as relevant, with a short justification.
# Both fields are required and carry a `description` in the generated schema.
class RelevantSentence(BaseModel):
    # Described as 0-based, but no `ge=0` constraint is declared here.
    sentence_index: int = Field(description="0-based index of the relevant sentence")
    # Strings longer than 80 characters fail validation.
    reason: str = Field(
        description="Brief explanation of why this sentence is relevant",
        max_length=80,
    )
# Highlight result for one chunk: the chunk's identity plus its relevant sentences.
class ChunkHighlights(BaseModel):
    document_id: str
    chunk_index: int
    # Optional; each instance gets its own fresh empty list when omitted.
    relevant_sentences: list[RelevantSentence] = Field(default_factory=list)
# Wrapper around a list of ChunkHighlights.
# `results` is required but may be an empty list.
class HighlightBatchResult(BaseModel):
    results: list[ChunkHighlights]
# Outcome summary of a highlight batch.
class HighlightBatchResponse(BaseModel):
    # Required; any value other than these three literals fails validation.
    status: Literal["completed", "partial", "failed"]
    # Defaults to 0. Presumably the number of targets served from a cache;
    # TODO confirm against the code that populates it.
    cached_count: int = 0
    # Defaults to a fresh empty list per instance.
    errors: list[str] = Field(default_factory=list)

View File

@ -0,0 +1,375 @@
"""Tests for Phase 5.4 Highlight Pydantic models.
Validates ChunkHighlightTarget, HighlightBatchRequest, RelevantSentence,
ChunkHighlights, HighlightBatchResult, and HighlightBatchResponse models.
Ensures correct validation, defaults, and serialization.
"""
import pytest
from pydantic import ValidationError
class TestChunkHighlightTarget:
    """Validation behaviour of the ChunkHighlightTarget model."""

    def test_valid_creation(self):
        """Supplying every field yields an instance that stores each value."""
        from app.models.highlight import ChunkHighlightTarget

        built = ChunkHighlightTarget(
            document_id="doc-123",
            chunk_index=5,
            sub_question_text="What is the main topic?",
            sub_question_index=0,
        )

        observed = (
            built.document_id,
            built.chunk_index,
            built.sub_question_text,
            built.sub_question_index,
        )
        assert observed == ("doc-123", 5, "What is the main topic?", 0)

    def test_missing_document_id_rejected(self):
        """Omitting document_id raises ValidationError."""
        from app.models.highlight import ChunkHighlightTarget

        without_document_id = {
            "chunk_index": 0,
            "sub_question_text": "test",
            "sub_question_index": 0,
        }
        with pytest.raises(ValidationError):
            ChunkHighlightTarget(**without_document_id)

    def test_missing_chunk_index_rejected(self):
        """Omitting chunk_index raises ValidationError."""
        from app.models.highlight import ChunkHighlightTarget

        without_chunk_index = {
            "document_id": "doc-123",
            "sub_question_text": "test",
            "sub_question_index": 0,
        }
        with pytest.raises(ValidationError):
            ChunkHighlightTarget(**without_chunk_index)

    def test_negative_chunk_index_accepted(self):
        """A negative chunk_index validates because no bound is declared."""
        from app.models.highlight import ChunkHighlightTarget

        fields = {
            "document_id": "doc-123",
            "chunk_index": -1,
            "sub_question_text": "test",
            "sub_question_index": 0,
        }
        built = ChunkHighlightTarget(**fields)
        assert built.chunk_index == -1
class TestHighlightBatchRequest:
    """Validation behaviour of the HighlightBatchRequest model."""

    def test_valid_with_multiple_targets(self):
        """A list of two targets is stored in order."""
        from app.models.highlight import ChunkHighlightTarget, HighlightBatchRequest

        first = ChunkHighlightTarget(
            document_id="doc-123",
            chunk_index=0,
            sub_question_text="Q1",
            sub_question_index=0,
        )
        second = ChunkHighlightTarget(
            document_id="doc-456",
            chunk_index=1,
            sub_question_text="Q2",
            sub_question_index=1,
        )

        batch = HighlightBatchRequest(targets=[first, second])

        assert len(batch.targets) == 2
        assert batch.targets[0].document_id == "doc-123"
        assert batch.targets[1].document_id == "doc-456"

    def test_empty_targets_accepted(self):
        """An empty targets list validates."""
        from app.models.highlight import HighlightBatchRequest

        batch = HighlightBatchRequest(targets=[])
        assert batch.targets == []

    def test_missing_targets_rejected(self):
        """Omitting targets raises ValidationError."""
        from app.models.highlight import HighlightBatchRequest

        with pytest.raises(ValidationError):
            HighlightBatchRequest()  # type: ignore

    def test_invalid_target_type_rejected(self):
        """A plain string in place of a target raises ValidationError."""
        from app.models.highlight import HighlightBatchRequest

        bogus_targets = ["not a target"]
        with pytest.raises(ValidationError):
            HighlightBatchRequest(targets=bogus_targets)  # type: ignore
class TestRelevantSentence:
    """Validation behaviour of the RelevantSentence model."""

    def test_valid_creation(self):
        """Both fields supplied: values are stored unchanged."""
        from app.models.highlight import RelevantSentence

        sentence = RelevantSentence(
            sentence_index=3,
            reason="Directly answers the sub-question",
        )
        assert (sentence.sentence_index, sentence.reason) == (
            3,
            "Directly answers the sub-question",
        )

    def test_reason_max_length_enforced(self):
        """An 81-character reason exceeds max_length=80 and is rejected."""
        from app.models.highlight import RelevantSentence

        too_long = "x" * 81
        with pytest.raises(ValidationError, match="reason"):
            RelevantSentence(sentence_index=0, reason=too_long)

    def test_reason_at_max_length_accepted(self):
        """An 80-character reason sits exactly on the limit and is accepted."""
        from app.models.highlight import RelevantSentence

        at_limit = "x" * 80
        sentence = RelevantSentence(sentence_index=0, reason=at_limit)
        assert len(sentence.reason) == 80

    def test_missing_sentence_index_rejected(self):
        """Omitting sentence_index raises ValidationError."""
        from app.models.highlight import RelevantSentence

        only_reason = {"reason": "test"}
        with pytest.raises(ValidationError):
            RelevantSentence(**only_reason)

    def test_missing_reason_rejected(self):
        """Omitting reason raises ValidationError."""
        from app.models.highlight import RelevantSentence

        only_index = {"sentence_index": 0}
        with pytest.raises(ValidationError):
            RelevantSentence(**only_index)
class TestChunkHighlights:
    """Construction and defaults of the ChunkHighlights model."""

    def test_valid_with_sentences(self):
        """Two relevant sentences are stored in the order given."""
        from app.models.highlight import ChunkHighlights, RelevantSentence

        picked = [
            RelevantSentence(sentence_index=1, reason="Key point"),
            RelevantSentence(sentence_index=3, reason="Supports answer"),
        ]
        highlights = ChunkHighlights(
            document_id="doc-123",
            chunk_index=0,
            relevant_sentences=picked,
        )

        assert highlights.document_id == "doc-123"
        assert highlights.chunk_index == 0
        assert len(highlights.relevant_sentences) == 2
        assert highlights.relevant_sentences[0].sentence_index == 1

    def test_default_empty_sentences(self):
        """Leaving relevant_sentences out produces an empty list."""
        from app.models.highlight import ChunkHighlights

        highlights = ChunkHighlights(document_id="doc-123", chunk_index=0)
        assert highlights.relevant_sentences == []

    def test_explicit_empty_sentences(self):
        """Passing an empty list explicitly is accepted."""
        from app.models.highlight import ChunkHighlights

        highlights = ChunkHighlights(
            document_id="doc-123",
            chunk_index=0,
            relevant_sentences=[],
        )
        assert highlights.relevant_sentences == []
class TestHighlightBatchResult:
    """Validation behaviour of the HighlightBatchResult model."""

    def test_valid_with_results(self):
        """A single ChunkHighlights entry is stored and readable."""
        from app.models.highlight import (
            ChunkHighlights,
            HighlightBatchResult,
            RelevantSentence,
        )

        entry = ChunkHighlights(
            document_id="doc-123",
            chunk_index=0,
            relevant_sentences=[RelevantSentence(sentence_index=0, reason="First")],
        )
        batch = HighlightBatchResult(results=[entry])

        assert len(batch.results) == 1
        assert batch.results[0].document_id == "doc-123"

    def test_empty_results_accepted(self):
        """An empty results list validates."""
        from app.models.highlight import HighlightBatchResult

        batch = HighlightBatchResult(results=[])
        assert batch.results == []

    def test_missing_results_rejected(self):
        """Omitting results raises ValidationError."""
        from app.models.highlight import HighlightBatchResult

        with pytest.raises(ValidationError):
            HighlightBatchResult()  # type: ignore
class TestHighlightBatchResponse:
    """Status values and defaults of the HighlightBatchResponse model."""

    def test_status_completed(self):
        """'completed' is accepted and the other fields take their defaults."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="completed")
        assert response.status == "completed"
        assert response.cached_count == 0
        assert response.errors == []

    def test_status_partial(self):
        """'partial' is accepted together with an explicit cached_count."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="partial", cached_count=2)
        assert (response.status, response.cached_count) == ("partial", 2)

    def test_status_failed(self):
        """'failed' is accepted together with a list of error strings."""
        from app.models.highlight import HighlightBatchResponse

        problems = ["document not found", "chunk out of range"]
        response = HighlightBatchResponse(status="failed", errors=problems)
        assert response.status == "failed"
        assert len(response.errors) == 2

    def test_invalid_status_rejected(self):
        """A status outside the Literal set raises ValidationError."""
        from app.models.highlight import HighlightBatchResponse

        with pytest.raises(ValidationError):
            HighlightBatchResponse(status="unknown")  # type: ignore

    def test_default_cached_count(self):
        """cached_count is 0 when not supplied."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="completed")
        assert response.cached_count == 0

    def test_default_errors(self):
        """errors is an empty list when not supplied."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="completed")
        assert response.errors == []
class TestSerialization:
    """Shape of the dictionaries produced by model_dump()."""

    def test_chunk_highlight_target_dump(self):
        """A ChunkHighlightTarget dumps to a flat dict of its four fields."""
        from app.models.highlight import ChunkHighlightTarget

        expected = {
            "document_id": "doc-123",
            "chunk_index": 5,
            "sub_question_text": "What is the main topic?",
            "sub_question_index": 0,
        }
        dumped = ChunkHighlightTarget(
            document_id="doc-123",
            chunk_index=5,
            sub_question_text="What is the main topic?",
            sub_question_index=0,
        ).model_dump()
        assert dumped == expected

    def test_highlight_batch_request_dump(self):
        """Nested targets are dumped as nested dicts."""
        from app.models.highlight import ChunkHighlightTarget, HighlightBatchRequest

        only_target = ChunkHighlightTarget(
            document_id="doc-123",
            chunk_index=0,
            sub_question_text="Q1",
            sub_question_index=0,
        )
        dumped = HighlightBatchRequest(targets=[only_target]).model_dump()

        expected_target = {
            "document_id": "doc-123",
            "chunk_index": 0,
            "sub_question_text": "Q1",
            "sub_question_index": 0,
        }
        assert dumped == {"targets": [expected_target]}

    def test_chunk_highlights_dump(self):
        """The defaulted relevant_sentences appears as an empty list."""
        from app.models.highlight import ChunkHighlights

        dumped = ChunkHighlights(document_id="doc-123", chunk_index=0).model_dump()
        expected = {
            "document_id": "doc-123",
            "chunk_index": 0,
            "relevant_sentences": [],
        }
        assert dumped == expected

    def test_highlight_batch_response_dump(self):
        """Explicit and defaulted fields both appear in the dump."""
        from app.models.highlight import HighlightBatchResponse

        dumped = HighlightBatchResponse(status="partial", cached_count=3).model_dump()
        expected = {
            "status": "partial",
            "cached_count": 3,
            "errors": [],
        }
        assert dumped == expected

    def test_relevant_sentence_reason_max_length(self):
        """An 80-character reason survives model_dump() unchanged."""
        from app.models.highlight import RelevantSentence

        at_limit = "x" * 80
        dumped = RelevantSentence(sentence_index=0, reason=at_limit).model_dump()
        assert dumped["reason"] == "x" * 80

View File

@ -0,0 +1,101 @@
"""Phase 5 tests: Sentence splitter utility.
Tests for backend/app/utils/sentence_splitter.py covering:
- English sentence boundaries (., !, ?)
- Chinese sentence boundaries (。, !, ?)
- Mixed English/Chinese text
- Empty and whitespace-only input
- Single sentence without trailing punctuation
- Bullet list items
- Multiple newlines as sentence boundaries
- Trailing/leading whitespace handling
"""
import importlib.util
from pathlib import Path
# Load sentence_splitter.py directly from its file path so these tests do not
# depend on the package being importable (avoids package-path import issues).
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "sentence_splitter.py"
spec = importlib.util.spec_from_file_location("sentence_splitter", str(MODULE_PATH))
# Check the spec BEFORE using it: module_from_spec(None) would otherwise fail
# with an unrelated AttributeError and the guard would never be reached.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot build an import spec for {MODULE_PATH}")
sentence_splitter = importlib.util.module_from_spec(spec)
spec.loader.exec_module(sentence_splitter)
# Function under test, exposed under a short module-level name.
split_sentences = sentence_splitter.split_sentences
def test_empty_string_returns_empty_list():
    """An empty input yields no sentences."""
    outcome = split_sentences("")
    assert outcome == []
def test_whitespace_only_returns_empty_list():
    """Inputs consisting solely of whitespace yield no sentences."""
    for blank in (" ", "\n\t "):
        assert split_sentences(blank) == []
def test_single_sentence_no_punctuation():
    """Text without any terminator comes back as one sentence."""
    assert split_sentences("Hello world") == ["Hello world"]
def test_english_sentences():
    """Period, exclamation mark and question mark each end a sentence."""
    expected = ["First sentence.", "Second sentence!", "Third sentence?"]
    actual = split_sentences("First sentence. Second sentence! Third sentence?")
    assert actual == expected
def test_chinese_sentences():
    """Chinese sentences split at their terminators with no whitespace between."""
    expected = ["第一句。", "第二句!", "第三句?"]
    actual = split_sentences("第一句。第二句!第三句?")
    assert actual == expected
def test_mixed_english_chinese():
    """A CJK full stop followed directly by English text is a boundary."""
    source = "The date is 2024年1月1日。The Contractor shall start work on Monday."
    expected = [
        "The date is 2024年1月1日。",
        "The Contractor shall start work on Monday.",
    ]
    assert split_sentences(source) == expected
def test_bullet_list_items_with_periods():
    """Each period-terminated bullet item becomes its own sentence."""
    bullets = "- Item one. - Item two. - Item three."
    pieces = split_sentences(bullets)
    assert pieces == ["- Item one.", "- Item two.", "- Item three."]
def test_multiple_newlines_as_boundaries():
    """A blank line separates two unpunctuated paragraphs."""
    paragraphs = split_sentences("First paragraph\n\nSecond paragraph")
    assert paragraphs == ["First paragraph", "Second paragraph"]
def test_single_newline_does_not_split():
    """A lone newline inside text is kept and does not create a boundary."""
    wrapped = "Line one\nLine two"
    assert split_sentences(wrapped) == [wrapped]
def test_trailing_whitespace_handled():
    """Whitespace after sentences and at the end of the text is stripped."""
    pieces = split_sentences("Hello world. \n\n Another sentence. ")
    assert pieces == ["Hello world.", "Another sentence."]
def test_leading_whitespace_handled():
    """Whitespace at the start of the text is stripped from the first sentence."""
    pieces = split_sentences(" Hello world. Another sentence.")
    assert pieces == ["Hello world.", "Another sentence."]
def test_exclamation_and_question_marks():
    """Question mark, exclamation mark and period all act as boundaries."""
    expected = ["What is this?", "It is amazing!", "Really."]
    assert split_sentences("What is this? It is amazing! Really.") == expected
def test_chinese_mixed_punctuation():
    """A comma never splits; terminators in either script do."""
    # The input must contain the same comma as the expected first sentence;
    # without it the comparison below could never succeed.
    text = "你好,世界。How are you? 我很好!"
    result = split_sentences(text)
    assert result == ["你好,世界。", "How are you?", "我很好!"]

View File

@ -0,0 +1,8 @@
import re
# Sentence boundary pattern, compiled once at import time.
#
# Terminators are ASCII . ! ? plus the CJK full stop (U+3002) and the
# fullwidth exclamation/question marks (U+FF01, U+FF1F). They are written as
# escapes so the pattern survives tools that normalise fullwidth punctuation.
_SENTENCE_BOUNDARY = re.compile(
    r"(?<=[.!?\u3002\uff01\uff1f])"  # directly after a terminator ...
    r"(?![.!?\u3002\uff01\uff1f])"  # ... that is the last of its run ("...", "?!")
    r"(?!(?<=\d\.)\d)"  # ... and is not a decimal point ("3.14")
    r"\s*"  # swallow whitespace after it (may be none, e.g. in CJK text)
    r"|(?<=\n)\s+"  # or: a whitespace run right after a newline (blank line)
)


def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences for English and Chinese content.

    A boundary is placed after a run of sentence terminators, whether or not
    whitespace follows, and at whitespace that directly follows a newline
    (so a blank line separates paragraphs while a single newline does not).
    A period between two digits is treated as a decimal point, and runs such
    as ``...`` or ``?!`` stay attached to the sentence they end.

    Args:
        text: The text to split. Empty, whitespace-only or ``None`` input is
            tolerated.

    Returns:
        The sentences in order, each stripped of surrounding whitespace and
        keeping its terminating punctuation. Empty fragments are dropped, so
        the result is ``[]`` when there is nothing to split.
    """
    if not text or not text.strip():
        return []
    fragments = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [fragment for fragment in fragments if fragment]