feat: add sentence splitter and highlight data models (Phase 5.4.1-5.4.2)
- sentence_splitter.py: regex-based sentence splitting for English + Chinese punctuation - highlight.py: 6 Pydantic models (ChunkHighlightTarget, HighlightBatchRequest, RelevantSentence, ChunkHighlights, HighlightBatchResult, HighlightBatchResponse) - 43 tests: 13 sentence splitter + 30 model validation
This commit is contained in:
parent
ec3b5a4ae1
commit
b11d31e2d1
|
|
@ -0,0 +1,38 @@
|
|||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ChunkHighlightTarget(BaseModel):
    # One (document chunk, sub-question) pair. All four fields are required.
    # No range constraints are declared, so negative indexes pass validation.

    # Chunk location.
    document_id: str
    chunk_index: int

    # Sub-question the chunk is paired with.
    sub_question_text: str
    sub_question_index: int
|
||||
|
||||
|
||||
class HighlightBatchRequest(BaseModel):
    # Batch wrapper around ChunkHighlightTarget items.
    # `targets` is required but may be an empty list (no minimum length is set).
    targets: list[ChunkHighlightTarget]
|
||||
|
||||
|
||||
class RelevantSentence(BaseModel):
    # A single sentence reference plus a short justification.
    # Both fields are required; the descriptions are carried in the field metadata.

    sentence_index: int = Field(
        description="0-based index of the relevant sentence",
    )

    # `reason` is capped at 80 characters; longer values fail validation.
    reason: str = Field(
        description="Brief explanation of why this sentence is relevant",
        max_length=80,
    )
|
||||
|
||||
|
||||
class ChunkHighlights(BaseModel):
    # RelevantSentence entries attached to one document chunk.

    # Required chunk location.
    document_id: str
    chunk_index: int

    # Optional; when omitted, a new empty list is created via the default factory.
    relevant_sentences: list[RelevantSentence] = Field(default_factory=list)
|
||||
|
||||
|
||||
class HighlightBatchResult(BaseModel):
    # Batch wrapper around ChunkHighlights items.
    # `results` is required but may be an empty list.
    results: list[ChunkHighlights]
|
||||
|
||||
|
||||
class HighlightBatchResponse(BaseModel):
    # Outcome summary for a highlight batch.

    # Required; any value outside these three literals fails validation.
    status: Literal["completed", "partial", "failed"]

    # Optional counters/messages with neutral defaults.
    cached_count: int = 0
    errors: list[str] = Field(default_factory=list)
|
||||
|
|
@ -0,0 +1,375 @@
|
|||
"""Tests for Phase 5.4 Highlight Pydantic models.
|
||||
|
||||
Validates ChunkHighlightTarget, HighlightBatchRequest, RelevantSentence,
|
||||
ChunkHighlights, HighlightBatchResult, and HighlightBatchResponse models.
|
||||
Ensures correct validation, defaults, and serialization.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
|
||||
class TestChunkHighlightTarget:
    """Construction and validation checks for ChunkHighlightTarget."""

    def test_valid_creation(self):
        """Every supplied field is stored unchanged on the model."""
        from app.models.highlight import ChunkHighlightTarget

        fields = {
            "document_id": "doc-123",
            "chunk_index": 5,
            "sub_question_text": "What is the main topic?",
            "sub_question_index": 0,
        }
        target = ChunkHighlightTarget(**fields)

        assert target.document_id == fields["document_id"]
        assert target.chunk_index == fields["chunk_index"]
        assert target.sub_question_text == fields["sub_question_text"]
        assert target.sub_question_index == fields["sub_question_index"]

    def test_missing_document_id_rejected(self):
        """Omitting document_id raises ValidationError."""
        from app.models.highlight import ChunkHighlightTarget

        without_document_id = {
            "chunk_index": 0,
            "sub_question_text": "test",
            "sub_question_index": 0,
        }
        with pytest.raises(ValidationError):
            ChunkHighlightTarget(**without_document_id)

    def test_missing_chunk_index_rejected(self):
        """Omitting chunk_index raises ValidationError."""
        from app.models.highlight import ChunkHighlightTarget

        without_chunk_index = {
            "document_id": "doc-123",
            "sub_question_text": "test",
            "sub_question_index": 0,
        }
        with pytest.raises(ValidationError):
            ChunkHighlightTarget(**without_chunk_index)

    def test_negative_chunk_index_accepted(self):
        """A negative chunk_index validates because the field has no lower bound."""
        from app.models.highlight import ChunkHighlightTarget

        negative_target = ChunkHighlightTarget(
            document_id="doc-123",
            chunk_index=-1,
            sub_question_text="test",
            sub_question_index=0,
        )

        assert negative_target.chunk_index == -1
|
||||
|
||||
|
||||
class TestHighlightBatchRequest:
    """Validation checks for the HighlightBatchRequest container."""

    def test_valid_with_multiple_targets(self):
        """Two targets are accepted and kept in the order given."""
        from app.models.highlight import ChunkHighlightTarget, HighlightBatchRequest

        first = ChunkHighlightTarget(
            document_id="doc-123",
            chunk_index=0,
            sub_question_text="Q1",
            sub_question_index=0,
        )
        second = ChunkHighlightTarget(
            document_id="doc-456",
            chunk_index=1,
            sub_question_text="Q2",
            sub_question_index=1,
        )

        request = HighlightBatchRequest(targets=[first, second])

        assert len(request.targets) == 2
        assert request.targets[0].document_id == "doc-123"
        assert request.targets[1].document_id == "doc-456"

    def test_empty_targets_accepted(self):
        """An empty targets list is a valid request."""
        from app.models.highlight import HighlightBatchRequest

        empty_request = HighlightBatchRequest(targets=[])

        assert empty_request.targets == []

    def test_missing_targets_rejected(self):
        """Constructing without targets raises ValidationError."""
        from app.models.highlight import HighlightBatchRequest

        with pytest.raises(ValidationError):
            HighlightBatchRequest()  # type: ignore

    def test_invalid_target_type_rejected(self):
        """A plain string in targets raises ValidationError."""
        from app.models.highlight import HighlightBatchRequest

        bad_targets = ["not a target"]
        with pytest.raises(ValidationError):
            HighlightBatchRequest(targets=bad_targets)  # type: ignore
|
||||
|
||||
|
||||
class TestRelevantSentence:
    """Validation checks for RelevantSentence, including the reason length cap."""

    def test_valid_creation(self):
        """Index and reason are both stored as given."""
        from app.models.highlight import RelevantSentence

        expected_reason = "Directly answers the sub-question"
        sentence = RelevantSentence(sentence_index=3, reason=expected_reason)

        assert sentence.sentence_index == 3
        assert sentence.reason == expected_reason

    def test_reason_max_length_enforced(self):
        """An 81-character reason is rejected, and the error mentions the field."""
        from app.models.highlight import RelevantSentence

        too_long = "x" * 81
        with pytest.raises(ValidationError, match="reason"):
            RelevantSentence(sentence_index=0, reason=too_long)

    def test_reason_at_max_length_accepted(self):
        """A reason of exactly 80 characters is accepted."""
        from app.models.highlight import RelevantSentence

        at_limit = "x" * 80
        sentence = RelevantSentence(sentence_index=0, reason=at_limit)

        assert len(sentence.reason) == 80

    def test_missing_sentence_index_rejected(self):
        """Omitting sentence_index raises ValidationError."""
        from app.models.highlight import RelevantSentence

        with pytest.raises(ValidationError):
            RelevantSentence(reason="test")

    def test_missing_reason_rejected(self):
        """Omitting reason raises ValidationError."""
        from app.models.highlight import RelevantSentence

        with pytest.raises(ValidationError):
            RelevantSentence(sentence_index=0)
|
||||
|
||||
|
||||
class TestChunkHighlights:
    """Checks for ChunkHighlights and its relevant_sentences default."""

    def test_valid_with_sentences(self):
        """Supplied sentences are stored alongside the chunk location."""
        from app.models.highlight import ChunkHighlights, RelevantSentence

        sentences = [
            RelevantSentence(sentence_index=1, reason="Key point"),
            RelevantSentence(sentence_index=3, reason="Supports answer"),
        ]
        highlights = ChunkHighlights(
            document_id="doc-123",
            chunk_index=0,
            relevant_sentences=sentences,
        )

        assert highlights.document_id == "doc-123"
        assert highlights.chunk_index == 0
        assert len(highlights.relevant_sentences) == 2
        assert highlights.relevant_sentences[0].sentence_index == 1

    def test_default_empty_sentences(self):
        """Leaving relevant_sentences out yields an empty list."""
        from app.models.highlight import ChunkHighlights

        highlights = ChunkHighlights(document_id="doc-123", chunk_index=0)

        assert highlights.relevant_sentences == []

    def test_explicit_empty_sentences(self):
        """Passing an empty list explicitly is accepted."""
        from app.models.highlight import ChunkHighlights

        highlights = ChunkHighlights(
            document_id="doc-123",
            chunk_index=0,
            relevant_sentences=[],
        )

        assert highlights.relevant_sentences == []
|
||||
|
||||
|
||||
class TestHighlightBatchResult:
    """Validation checks for the HighlightBatchResult container."""

    def test_valid_with_results(self):
        """A single ChunkHighlights entry is accepted and retrievable."""
        from app.models.highlight import (
            ChunkHighlights,
            HighlightBatchResult,
            RelevantSentence,
        )

        only_entry = ChunkHighlights(
            document_id="doc-123",
            chunk_index=0,
            relevant_sentences=[RelevantSentence(sentence_index=0, reason="First")],
        )
        batch = HighlightBatchResult(results=[only_entry])

        assert len(batch.results) == 1
        assert batch.results[0].document_id == "doc-123"

    def test_empty_results_accepted(self):
        """An empty results list is valid."""
        from app.models.highlight import HighlightBatchResult

        batch = HighlightBatchResult(results=[])

        assert batch.results == []

    def test_missing_results_rejected(self):
        """Constructing without results raises ValidationError."""
        from app.models.highlight import HighlightBatchResult

        with pytest.raises(ValidationError):
            HighlightBatchResult()  # type: ignore
|
||||
|
||||
|
||||
class TestHighlightBatchResponse:
    """Checks for HighlightBatchResponse status values and defaults."""

    def test_status_completed(self):
        """'completed' is accepted and the other fields take their defaults."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="completed")

        assert response.status == "completed"
        assert response.cached_count == 0
        assert response.errors == []

    def test_status_partial(self):
        """'partial' is accepted together with an explicit cached_count."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="partial", cached_count=2)

        assert response.status == "partial"
        assert response.cached_count == 2

    def test_status_failed(self):
        """'failed' is accepted together with a list of error messages."""
        from app.models.highlight import HighlightBatchResponse

        messages = ["document not found", "chunk out of range"]
        response = HighlightBatchResponse(status="failed", errors=messages)

        assert response.status == "failed"
        assert len(response.errors) == 2

    def test_invalid_status_rejected(self):
        """A status outside the allowed literals raises ValidationError."""
        from app.models.highlight import HighlightBatchResponse

        with pytest.raises(ValidationError):
            HighlightBatchResponse(status="unknown")  # type: ignore

    def test_default_cached_count(self):
        """cached_count is 0 when not supplied."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="completed")

        assert response.cached_count == 0

    def test_default_errors(self):
        """errors is an empty list when not supplied."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="completed")

        assert response.errors == []
|
||||
|
||||
|
||||
class TestSerialization:
    """Checks that model_dump() yields the expected plain dictionaries."""

    def test_chunk_highlight_target_dump(self):
        """A dumped ChunkHighlightTarget equals the dict it was built from."""
        from app.models.highlight import ChunkHighlightTarget

        expected = {
            "document_id": "doc-123",
            "chunk_index": 5,
            "sub_question_text": "What is the main topic?",
            "sub_question_index": 0,
        }
        target = ChunkHighlightTarget(
            document_id="doc-123",
            chunk_index=5,
            sub_question_text="What is the main topic?",
            sub_question_index=0,
        )

        assert target.model_dump() == expected

    def test_highlight_batch_request_dump(self):
        """Nested targets are dumped as nested dicts."""
        from app.models.highlight import ChunkHighlightTarget, HighlightBatchRequest

        single_target = ChunkHighlightTarget(
            document_id="doc-123",
            chunk_index=0,
            sub_question_text="Q1",
            sub_question_index=0,
        )
        request = HighlightBatchRequest(targets=[single_target])

        dumped = request.model_dump()

        assert dumped == {
            "targets": [
                {
                    "document_id": "doc-123",
                    "chunk_index": 0,
                    "sub_question_text": "Q1",
                    "sub_question_index": 0,
                },
            ]
        }

    def test_chunk_highlights_dump(self):
        """The defaulted relevant_sentences appears in the dump as an empty list."""
        from app.models.highlight import ChunkHighlights

        dumped = ChunkHighlights(document_id="doc-123", chunk_index=0).model_dump()

        assert dumped == {
            "document_id": "doc-123",
            "chunk_index": 0,
            "relevant_sentences": [],
        }

    def test_highlight_batch_response_dump(self):
        """Explicit and defaulted response fields all appear in the dump."""
        from app.models.highlight import HighlightBatchResponse

        response = HighlightBatchResponse(status="partial", cached_count=3)

        dumped = response.model_dump()

        assert dumped == {
            "status": "partial",
            "cached_count": 3,
            "errors": [],
        }

    def test_relevant_sentence_reason_max_length(self):
        """An 80-character reason is dumped unchanged."""
        from app.models.highlight import RelevantSentence

        at_limit = RelevantSentence(sentence_index=0, reason="x" * 80)

        dumped = at_limit.model_dump()

        assert dumped["reason"] == "x" * 80
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
"""Phase 5 tests: Sentence splitter utility.
|
||||
|
||||
Tests for backend/app/utils/sentence_splitter.py covering:
|
||||
- English sentence boundaries (., !, ?)
|
||||
- Chinese sentence boundaries (。, !, ?)
|
||||
- Mixed English/Chinese text
|
||||
- Empty and whitespace-only input
|
||||
- Single sentence without trailing punctuation
|
||||
- Bullet list items
|
||||
- Multiple newlines as sentence boundaries
|
||||
- Trailing/leading whitespace handling
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
|
||||
# Load sentence_splitter.py straight from its file path so these tests do not
# depend on the package import path being configured.
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "sentence_splitter.py"
spec = importlib.util.spec_from_file_location("sentence_splitter", str(MODULE_PATH))

# Check the spec BEFORE using it: spec_from_file_location() can return None,
# and module_from_spec(None) would fail with an unrelated AttributeError.
# An explicit raise is used instead of assert so the guard survives `python -O`.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot load sentence_splitter from {MODULE_PATH}")

sentence_splitter = importlib.util.module_from_spec(spec)
spec.loader.exec_module(sentence_splitter)

# The function under test, exposed at module level for the tests in this file.
split_sentences = sentence_splitter.split_sentences
|
||||
|
||||
|
||||
def test_empty_string_returns_empty_list():
    """An empty string yields no sentences."""
    result = split_sentences("")
    assert result == []
|
||||
|
||||
|
||||
def test_whitespace_only_returns_empty_list():
    """Input made up solely of whitespace yields no sentences."""
    for blank in (" ", "\n\t "):
        assert split_sentences(blank) == []
|
||||
|
||||
|
||||
def test_single_sentence_no_punctuation():
    """Text without any terminator comes back as one sentence."""
    sentences = split_sentences("Hello world")
    assert sentences == ["Hello world"]
|
||||
|
||||
|
||||
def test_english_sentences():
    """'.', '!' and '?' each end an English sentence and stay attached to it."""
    expected = ["First sentence.", "Second sentence!", "Third sentence?"]
    sentences = split_sentences("First sentence. Second sentence! Third sentence?")
    assert sentences == expected
|
||||
|
||||
|
||||
def test_chinese_sentences():
    """Chinese sentences split at their terminators with no whitespace between them."""
    expected = ["第一句。", "第二句!", "第三句?"]
    sentences = split_sentences("第一句。第二句!第三句?")
    assert sentences == expected
|
||||
|
||||
|
||||
def test_mixed_english_chinese():
    """A Chinese full stop followed directly by English text is a boundary."""
    source = "The date is 2024年1月1日。The Contractor shall start work on Monday."
    expected = [
        "The date is 2024年1月1日。",
        "The Contractor shall start work on Monday.",
    ]
    assert split_sentences(source) == expected
|
||||
|
||||
|
||||
def test_bullet_list_items_with_periods():
    """Each period-terminated bullet item becomes its own sentence, dash included."""
    source = "- Item one. - Item two. - Item three."
    expected = ["- Item one.", "- Item two.", "- Item three."]
    assert split_sentences(source) == expected
|
||||
|
||||
|
||||
def test_multiple_newlines_as_boundaries():
    """Two consecutive newlines separate sentences even without punctuation."""
    sentences = split_sentences("First paragraph\n\nSecond paragraph")
    assert sentences == ["First paragraph", "Second paragraph"]
|
||||
|
||||
|
||||
def test_single_newline_does_not_split():
    """A lone newline inside the text is kept and does not start a new sentence."""
    source = "Line one\nLine two"
    assert split_sentences(source) == [source]
|
||||
|
||||
|
||||
def test_trailing_whitespace_handled():
    """Whitespace after terminators, including blank lines, is stripped from results."""
    source = "Hello world. \n\n Another sentence. "
    expected = ["Hello world.", "Another sentence."]
    assert split_sentences(source) == expected
|
||||
|
||||
|
||||
def test_leading_whitespace_handled():
    """Whitespace before the first sentence is stripped."""
    source = " Hello world. Another sentence."
    expected = ["Hello world.", "Another sentence."]
    assert split_sentences(source) == expected
|
||||
|
||||
|
||||
def test_exclamation_and_question_marks():
    """'?', '!' and '.' are all treated as sentence terminators."""
    sentences = split_sentences("What is this? It is amazing! Really.")
    assert sentences == ["What is this?", "It is amazing!", "Really."]
|
||||
|
||||
|
||||
def test_chinese_mixed_punctuation():
    """Mixed-language text splits at terminators only; the comma is not a boundary."""
    source = "你好,世界。How are you? 我很好!"
    expected = ["你好,世界。", "How are you?", "我很好!"]
    assert split_sentences(source) == expected
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
import re
|
||||
|
||||
|
||||
# Sentence terminators: ASCII . ! ? plus the ideographic full stop (U+3002)
# and the fullwidth ! and ? (U+FF01, U+FF1F). Written as escapes so the set
# survives tools that normalise fullwidth forms to their ASCII look-alikes.
_TERMINATORS = ".!?\u3002\uff01\uff1f"

_SENTENCE_BOUNDARY = re.compile(
    # Immediately after a terminator ...
    rf"(?<=[{_TERMINATORS}])"
    # ... that is the last of its run, so "..." and "?!" stay in one piece ...
    rf"(?![{_TERMINATORS}])"
    # ... and is not a decimal point between two digits ("3.14") ...
    r"(?!(?<=\d\.)\d)"
    # ... consume any following whitespace (may be none, e.g. in CJK text).
    r"\s*"
    # Alternatively: a blank line, i.e. two newlines with only whitespace
    # between them. A single newline, even followed by indentation, is not
    # a boundary.
    r"|\n\s*\n\s*"
)


def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences on English and Chinese punctuation.

    A boundary is placed after a run of terminators (``.``, ``!``, ``?``,
    the ideographic full stop, fullwidth ``!``/``?``) or at a blank line.
    Terminators stay attached to the sentence they end. A period between
    two digits is treated as a decimal point, not a boundary.

    Args:
        text: The text to split. May be empty or whitespace-only.

    Returns:
        The sentences in order, each stripped of surrounding whitespace,
        with empty pieces dropped. Empty or whitespace-only input yields
        an empty list.
    """
    if not text or not text.strip():
        return []
    pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [piece for piece in pieces if piece]
|
||||
Loading…
Reference in New Issue