102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
"""Phase 5 tests: Sentence splitter utility.

Tests for backend/app/utils/sentence_splitter.py covering:
- English sentence boundaries (., !, ?)
- Chinese sentence boundaries (。, !, ?)
- Mixed English/Chinese text
- Empty and whitespace-only input
- Single sentence without trailing punctuation
- Bullet list items
- Multiple newlines as sentence boundaries
- Trailing/leading whitespace handling
"""
|
||
|
||
import importlib.util
|
||
from pathlib import Path
|
||
|
||
# Load sentence_splitter straight from its file path so these tests do not
# depend on the package being importable (avoids package-path import issues).
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "sentence_splitter.py"
spec = importlib.util.spec_from_file_location("sentence_splitter", str(MODULE_PATH))
# Validate the spec *before* it is used: module_from_spec() dereferences
# spec.loader, so a None spec would otherwise fail there with an opaque
# AttributeError and this check would never be reached.
assert spec is not None and spec.loader is not None, (
    f"Cannot create an import spec for {MODULE_PATH}"
)
sentence_splitter = importlib.util.module_from_spec(spec)
# Execute the module body so its top-level names become attributes.
spec.loader.exec_module(sentence_splitter)
# Function under test, exposed as a module-level name for the tests below.
split_sentences = sentence_splitter.split_sentences
|
||
|
||
|
||
def test_empty_string_returns_empty_list():
    """An empty input string must produce an empty list."""
    actual = split_sentences("")
    assert actual == []
|
||
|
||
|
||
def test_whitespace_only_returns_empty_list():
    """Input made up solely of whitespace must produce an empty list."""
    # Checked in the same order as before: a plain space, then mixed whitespace.
    for blank in (" ", "\n\t "):
        assert split_sentences(blank) == []
|
||
|
||
|
||
def test_single_sentence_no_punctuation():
    """Text without terminal punctuation is expected back as one sentence."""
    expected = ["Hello world"]
    assert split_sentences("Hello world") == expected
|
||
|
||
|
||
def test_english_sentences():
    """Expects a split after '.', '!' and '?', with each mark kept in place."""
    expected = ["First sentence.", "Second sentence!", "Third sentence?"]
    sentences = split_sentences("First sentence. Second sentence! Third sentence?")
    assert sentences == expected
|
||
|
||
|
||
def test_chinese_sentences():
    """Expects Chinese text to be split after each sentence-ending mark."""
    assert split_sentences("第一句。第二句!第三句?") == [
        "第一句。",
        "第二句!",
        "第三句?",
    ]
|
||
|
||
|
||
def test_mixed_english_chinese():
    """Expects a split at the Chinese full stop inside otherwise English text."""
    first = "The date is 2024年1月1日。"
    second = "The Contractor shall start work on Monday."
    sentences = split_sentences(
        "The date is 2024年1月1日。The Contractor shall start work on Monday."
    )
    assert sentences == [first, second]
|
||
|
||
|
||
def test_bullet_list_items_with_periods():
    """Expects each period-terminated bullet item to become its own sentence."""
    bullets = "- Item one. - Item two. - Item three."
    expected = ["- Item one.", "- Item two.", "- Item three."]
    assert split_sentences(bullets) == expected
|
||
|
||
|
||
def test_multiple_newlines_as_boundaries():
    """Expects a blank line (two newlines) to separate sentences."""
    paragraphs = split_sentences("First paragraph\n\nSecond paragraph")
    assert paragraphs == ["First paragraph", "Second paragraph"]
|
||
|
||
|
||
def test_single_newline_does_not_split():
    """Expects a lone newline to stay inside a single sentence."""
    expected = ["Line one\nLine two"]
    assert split_sentences("Line one\nLine two") == expected
|
||
|
||
|
||
def test_trailing_whitespace_handled():
    """Expects whitespace around each sentence to be absent from the output."""
    padded = "Hello world. \n\n Another sentence. "
    expected = ["Hello world.", "Another sentence."]
    assert split_sentences(padded) == expected
|
||
|
||
|
||
def test_leading_whitespace_handled():
    """Expects leading whitespace to be absent from the first sentence."""
    sentences = split_sentences(" Hello world. Another sentence.")
    assert sentences == [
        "Hello world.",
        "Another sentence.",
    ]
|
||
|
||
|
||
def test_exclamation_and_question_marks():
    """Expects '?', '!' and '.' to each close a sentence, in that order."""
    expected = ["What is this?", "It is amazing!", "Really."]
    outcome = split_sentences("What is this? It is amazing! Really.")
    assert outcome == expected
|
||
|
||
|
||
def test_chinese_mixed_punctuation():
    """Expects a split at sentence-ending marks only, not at the Chinese comma."""
    mixed = "你好,世界。How are you? 我很好!"
    expected = [
        "你好,世界。",
        "How are you?",
        "我很好!",
    ]
    assert split_sentences(mixed) == expected
|