legco_ai_assistant/backend/app/test/test_phase5_sentence_splitt...

"""Phase 5 tests: Sentence splitter utility.

Tests for backend/app/utils/sentence_splitter.py covering:
- English sentence boundaries (., !, ?)
- Chinese sentence boundaries (。, ！, ？)
- Mixed English/Chinese text
- Empty and whitespace-only input
- Single sentence without trailing punctuation
- Bullet list items
- Multiple newlines as sentence boundaries
- Trailing/leading whitespace handling
"""

import importlib.util
from pathlib import Path

# Dynamically load the sentence_splitter module from its file path to avoid
# package-path import issues.
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "sentence_splitter.py"
spec = importlib.util.spec_from_file_location("sentence_splitter", str(MODULE_PATH))
# Check the spec and its loader BEFORE they are used, so a failure to build
# the spec is reported here with a clear message instead of surfacing later
# as an unrelated error inside importlib.
assert spec is not None and spec.loader is not None, (
    f"Could not create an import spec for {MODULE_PATH}"
)
sentence_splitter = importlib.util.module_from_spec(spec)
spec.loader.exec_module(sentence_splitter)
# The function under test, shared by every test case in this module.
split_sentences = sentence_splitter.split_sentences


def test_empty_string_returns_empty_list():
    """The empty string is expected to produce an empty list."""
    sentences = split_sentences("")
    assert sentences == []


def test_whitespace_only_returns_empty_list():
    """Inputs made only of spaces, tabs or newlines are expected to produce an empty list."""
    for blank_text in ("   ", "\n\t  "):
        assert split_sentences(blank_text) == []


def test_single_sentence_no_punctuation():
    """Text without any terminating punctuation is expected back as one sentence."""
    sentences = split_sentences("Hello world")
    assert sentences == ["Hello world"]


def test_english_sentences():
    """'.', '!' and '?' are each expected to end a sentence and stay attached to it."""
    expected = ["First sentence.", "Second sentence!", "Third sentence?"]
    actual = split_sentences("First sentence. Second sentence! Third sentence?")
    assert actual == expected


def test_chinese_sentences():
    """Full-width '。', '！' and '？' are each expected to end a sentence."""
    expected = ["第一句。", "第二句！", "第三句？"]
    actual = split_sentences("第一句。第二句！第三句？")
    assert actual == expected


def test_mixed_english_chinese():
    """A Chinese full stop followed directly by English text is expected to split there."""
    source = "The date is 2024年1月1日。The Contractor shall start work on Monday."
    first = "The date is 2024年1月1日。"
    second = "The Contractor shall start work on Monday."
    assert split_sentences(source) == [first, second]


def test_bullet_list_items_with_periods():
    """Inline '- ' bullet items ending in a period are expected as separate sentences."""
    expected = ["- Item one.", "- Item two.", "- Item three."]
    actual = split_sentences("- Item one. - Item two. - Item three.")
    assert actual == expected


def test_multiple_newlines_as_boundaries():
    """A blank line (two consecutive newlines) is expected to act as a sentence boundary."""
    paragraphs = split_sentences("First paragraph\n\nSecond paragraph")
    assert paragraphs == ["First paragraph", "Second paragraph"]


def test_single_newline_does_not_split():
    """A lone newline is expected to stay inside the sentence rather than split it."""
    source = "Line one\nLine two"
    sentences = split_sentences(source)
    assert sentences == ["Line one\nLine two"]


def test_trailing_whitespace_handled():
    """Whitespace after a sentence, including around a blank line, is expected to be stripped."""
    expected = ["Hello world.", "Another sentence."]
    actual = split_sentences("Hello world.   \n\n  Another sentence.  ")
    assert actual == expected


def test_leading_whitespace_handled():
    """Whitespace before the first sentence is expected to be stripped from the output."""
    expected = ["Hello world.", "Another sentence."]
    actual = split_sentences("   Hello world. Another sentence.")
    assert actual == expected


def test_exclamation_and_question_marks():
    """Sentences ending in '?', '!' and '.' in turn are expected to be split after each mark."""
    expected = ["What is this?", "It is amazing!", "Really."]
    actual = split_sentences("What is this? It is amazing! Really.")
    assert actual == expected


def test_chinese_mixed_punctuation():
    """Alternating Chinese and English sentences are expected to split on either script's marks.

    The full-width comma inside the first sentence is not expected to cause a split.
    """
    expected = ["你好，世界。", "How are you?", "我很好！"]
    actual = split_sentences("你好，世界。How are you? 我很好！")
    assert actual == expected