legco_ai_assistant/backend/app/test/test_phase5_sentence_splitt...

102 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Phase 5 tests: Sentence splitter utility.
Tests for backend/app/utils/sentence_splitter.py covering:
- English sentence boundaries (., !, ?)
- Chinese sentence boundaries (。, ！, ？)
- Mixed English/Chinese text
- Empty and whitespace-only input
- Single sentence without trailing punctuation
- Bullet list items
- Multiple newlines as sentence boundaries
- Trailing/leading whitespace handling
"""
import importlib.util
from pathlib import Path
# Load sentence_splitter straight from its file path so these tests do not
# depend on the package being importable (avoids package-path import issues).
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "sentence_splitter.py"
spec = importlib.util.spec_from_file_location("sentence_splitter", str(MODULE_PATH))
# Validate the spec *before* it is used: passing an unusable spec on to
# module_from_spec() would fail there, ahead of any later check, and without
# saying which file could not be loaded.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot load sentence_splitter from {MODULE_PATH}")
sentence_splitter = importlib.util.module_from_spec(spec)
spec.loader.exec_module(sentence_splitter)
# Function under test, shared by every test below.
split_sentences = sentence_splitter.split_sentences
def test_empty_string_returns_empty_list():
    """An empty input string must produce an empty list of sentences."""
    sentences = split_sentences("")
    assert sentences == []
def test_whitespace_only_returns_empty_list():
    """Inputs made up solely of whitespace must produce an empty list."""
    # Checked in order: a plain space, then a newline/tab/space mix.
    for blank in (" ", "\n\t "):
        assert split_sentences(blank) == []
def test_single_sentence_no_punctuation():
    """Text without any terminator comes back as one unchanged sentence."""
    expected = ["Hello world"]
    assert split_sentences("Hello world") == expected
def test_english_sentences():
    """'.', '!' and '?' each end a sentence and stay attached to it."""
    expected = ["First sentence.", "Second sentence!", "Third sentence?"]
    sentences = split_sentences("First sentence. Second sentence! Third sentence?")
    assert sentences == expected
def test_chinese_sentences():
    """Chinese text with no spaces between sentences splits after each terminator."""
    expected = ["第一句。", "第二句!", "第三句?"]
    sentences = split_sentences("第一句。第二句!第三句?")
    assert sentences == expected
def test_mixed_english_chinese():
    """A '。' directly followed by English text still ends the first sentence."""
    expected = [
        "The date is 2024年1月1日。",
        "The Contractor shall start work on Monday.",
    ]
    sentences = split_sentences(
        "The date is 2024年1月1日。The Contractor shall start work on Monday."
    )
    assert sentences == expected
def test_bullet_list_items_with_periods():
    """Each period-terminated bullet item is its own sentence, dash included."""
    expected = ["- Item one.", "- Item two.", "- Item three."]
    sentences = split_sentences("- Item one. - Item two. - Item three.")
    assert sentences == expected
def test_multiple_newlines_as_boundaries():
    """A blank line (two consecutive newlines) separates sentences."""
    expected = ["First paragraph", "Second paragraph"]
    sentences = split_sentences("First paragraph\n\nSecond paragraph")
    assert sentences == expected
def test_single_newline_does_not_split():
    """A lone newline is not a boundary; the text is returned as one item."""
    source = "Line one\nLine two"
    # The expected sentence is the input itself, newline preserved.
    assert split_sentences(source) == ["Line one\nLine two"]
def test_trailing_whitespace_handled():
    """Whitespace after a sentence, including at the end of input, is stripped."""
    expected = ["Hello world.", "Another sentence."]
    sentences = split_sentences("Hello world. \n\n Another sentence. ")
    assert sentences == expected
def test_leading_whitespace_handled():
    """Whitespace before the first sentence is stripped from the result."""
    expected = ["Hello world.", "Another sentence."]
    sentences = split_sentences(" Hello world. Another sentence.")
    assert sentences == expected
def test_exclamation_and_question_marks():
    """'?', '!' and '.' in a single string each close their own sentence."""
    expected = ["What is this?", "It is amazing!", "Really."]
    sentences = split_sentences("What is this? It is amazing! Really.")
    assert sentences == expected
def test_chinese_mixed_punctuation():
    """Interleaved Chinese and English sentences split one per terminator.

    The expected output keeps the comma inside the first Chinese sentence, so
    the comma must neither be dropped nor treated as a sentence boundary.
    """
    # The input has to contain the same comma the expected output contains;
    # without it the assertion below could never hold.
    # NOTE(review): the punctuation here is ASCII (",", "!") -- confirm whether
    # the full-width forms were intended for the Chinese segments.
    text = "你好,世界。How are you? 我很好!"
    result = split_sentences(text)
    assert result == ["你好,世界。", "How are you?", "我很好!"]