"""Phase 5 tests: Sentence splitter utility. Tests for backend/app/utils/sentence_splitter.py covering: - English sentence boundaries (., !, ?) - Chinese sentence boundaries (。, !, ?) - Mixed English/Chinese text - Empty and whitespace-only input - Single sentence without trailing punctuation - Bullet list items - Multiple newlines as sentence boundaries - Trailing/leading whitespace handling """ import importlib.util from pathlib import Path # Dynamically load the sentence_splitter module to avoid package-path import issues. MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "sentence_splitter.py" spec = importlib.util.spec_from_file_location("sentence_splitter", str(MODULE_PATH)) sentence_splitter = importlib.util.module_from_spec(spec) # type: ignore assert spec is not None and spec.loader is not None spec.loader.exec_module(sentence_splitter) # type: ignore split_sentences = getattr(sentence_splitter, "split_sentences") def test_empty_string_returns_empty_list(): assert split_sentences("") == [] def test_whitespace_only_returns_empty_list(): assert split_sentences(" ") == [] assert split_sentences("\n\t ") == [] def test_single_sentence_no_punctuation(): text = "Hello world" result = split_sentences(text) assert result == ["Hello world"] def test_english_sentences(): text = "First sentence. Second sentence! Third sentence?" result = split_sentences(text) assert result == ["First sentence.", "Second sentence!", "Third sentence?"] def test_chinese_sentences(): text = "第一句。第二句!第三句?" result = split_sentences(text) assert result == ["第一句。", "第二句!", "第三句?"] def test_mixed_english_chinese(): text = "The date is 2024年1月1日。The Contractor shall start work on Monday." result = split_sentences(text) assert result == [ "The date is 2024年1月1日。", "The Contractor shall start work on Monday.", ] def test_bullet_list_items_with_periods(): text = "- Item one. - Item two. - Item three." result = split_sentences(text) assert result == ["- Item one.", "- Item two.", "- Item three."] def test_multiple_newlines_as_boundaries(): text = "First paragraph\n\nSecond paragraph" result = split_sentences(text) assert result == ["First paragraph", "Second paragraph"] def test_single_newline_does_not_split(): text = "Line one\nLine two" result = split_sentences(text) assert result == ["Line one\nLine two"] def test_trailing_whitespace_handled(): text = "Hello world. \n\n Another sentence. " result = split_sentences(text) assert result == ["Hello world.", "Another sentence."] def test_leading_whitespace_handled(): text = " Hello world. Another sentence." result = split_sentences(text) assert result == ["Hello world.", "Another sentence."] def test_exclamation_and_question_marks(): text = "What is this? It is amazing! Really." result = split_sentences(text) assert result == ["What is this?", "It is amazing!", "Really."] def test_chinese_mixed_punctuation(): text = "你好,世界。How are you? 我很好!" result = split_sentences(text) assert result == ["你好,世界。", "How are you?", "我很好!"]