# legco_ai_assistant/backend/app/utils/chunking.py

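"""Chunking utilities for Phase 1.2.

Provides the abstract ChunkingStrategy, a TokenChunkingStrategy that uses
tiktoken to split text into token-based windows, a QuestionChunkingStrategy
that splits text on detected Q&A structure, and the get_chunking_strategy
factory that selects between them.
"""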
from __future__ import annotations

import logging
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional, Tuple

if TYPE_CHECKING:
    from app.core.config import Settings
    from app.services.llm_client import LLMClient

logger = logging.getLogger(__name__)


class ChunkingStrategy(ABC):
    """Common interface implemented by every text chunking strategy."""

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Break ``text`` up and return the resulting chunks as strings.

        Contract for implementations:

        * Empty or whitespace-only input yields an empty list.
        * Every chunk is a distinct window in the produced sequence, although
          consecutive chunks may repeat some raw text when the strategy uses
          overlapping token windows.

        Args:
            text: The text to split.

        Returns:
            The chunks, as a list of strings.
        """
        raise NotImplementedError


class TokenChunkingStrategy(ChunkingStrategy):
    """Chunk text by token windows using the tiktoken encoder.

    The strategy operates on token counts: each chunk contains up to
    ``chunk_size`` tokens, and consecutive chunks share ``overlap`` tokens.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200, encoding_name: str = "cl100k_base"):
        """Validate the window parameters and load the tiktoken encoding.

        Args:
            chunk_size: Maximum number of tokens per chunk. Must be positive.
            overlap: Number of tokens shared by consecutive chunks. Must be
                non-negative.
            encoding_name: Name passed to ``tiktoken.get_encoding``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap
        # Lazy import to avoid import-time penalties in environments without tokenizers
        import tiktoken

        self._encoding = tiktoken.get_encoding(encoding_name)

    def chunk(self, text: str) -> List[str]:
        """Split ``text`` into overlapping windows of at most ``chunk_size`` tokens.

        The window start advances by ``chunk_size - overlap`` tokens each
        step. Iteration stops as soon as a window reaches the end of the
        token sequence, so the result never ends with a chunk that only
        repeats the overlap tail of the chunk before it.

        Args:
            text: The text to split.

        Returns:
            The decoded chunks in order; an empty list for empty or
            whitespace-only input.

        Raises:
            TypeError: If ``text`` is not a string.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string")
        if not text.strip():
            return []

        tokens = self._encoding.encode(text)
        if not tokens:
            return []

        # An overlap >= chunk_size would give a non-positive stride; advance
        # one token at a time instead so the loop always makes progress.
        step = max(self.chunk_size - self.overlap, 1)
        total = len(tokens)

        chunks: List[str] = []
        for start in range(0, total, step):
            end = start + self.chunk_size
            chunks.append(self._encoding.decode(tokens[start:end]))
            # This window already covers the last token. Any further window
            # would start inside it and contain only tokens already emitted.
            if end >= total:
                break

        return chunks

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
    ) -> List[Tuple[str, int]]:
        """Chunk page-segmented text with overlap from adjacent pages.

        For each page, creates one chunk containing:
          [last overlap_tokens of previous page] + [full current page] + [first overlap_tokens of next page]

        The parts are joined with a newline. One chunk is produced per page;
        a page is never split even if it is oversized. The page number in
        each result is the one supplied for the main page, not for the pages
        the overlap was taken from.

        Args:
            pages: List of (page_number, page_text) tuples.
            overlap_tokens: Number of tokens to include from each adjacent
                page. Values <= 0 disable the overlap.

        Returns:
            List of (chunk_text, page_number) tuples. One chunk per page.
        """
        if not pages:
            return []

        use_overlap = overlap_tokens > 0
        # Token ids are only needed to cut the overlap slices, so skip the
        # tokenization entirely when no overlap was requested.
        tokenized: List[List[int]] = (
            [self._encoding.encode(text) for _, text in pages] if use_overlap else []
        )
        last_index = len(pages) - 1

        results: List[Tuple[str, int]] = []
        for i, (page_num, page_text) in enumerate(pages):
            parts: List[str] = []

            # NOTE(review): a token slice may start or end inside a multi-byte
            # character; how decode renders that should be confirmed against
            # the tiktoken documentation.
            if use_overlap and i > 0:
                parts.append(self._encoding.decode(tokenized[i - 1][-overlap_tokens:]))

            parts.append(page_text)

            if use_overlap and i < last_index:
                parts.append(self._encoding.decode(tokenized[i + 1][:overlap_tokens]))

            results.append(("\n".join(parts), page_num))

        return results


class QuestionChunkingStrategy(ChunkingStrategy):
    """Chunk text by detecting Q&A structure using LLM and/or regex patterns.

    Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers.
    Falls back to a single narrative section when no structure is detected.
    """

    def __init__(
        self,
        settings: "Settings",
        llm_client: Optional["LLMClient"] = None,
    ):
        """Store the collaborators and read the chunk size limit.

        Args:
            settings: Application settings. ``qa_max_chunk_tokens`` is read
                from it, defaulting to 3000 when the attribute is absent.
            llm_client: Optional client used by ``chunk_pages`` for structure
                detection when the Chinese and English splitters find nothing.
        """
        self._settings = settings
        self._llm_client = llm_client
        self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000)
        # Metadata (third tuple element) of the chunks from the latest call.
        self._chunk_metadata: List[dict] = []

    def chunk(self, text: str) -> List[str]:
        """Split text into chunks using Q&A detection (for DOCX/TXT).

        Tries the Chinese splitter, then the English splitter, and finally
        treats the whole text as one narrative section. The per-chunk
        metadata of the result is kept in ``self._chunk_metadata``.

        Args:
            text: The text to split.

        Returns:
            The chunk texts; an empty list for empty or whitespace-only input.
        """
        if not text or not text.strip():
            return []

        from app.utils.qa_chunking import (
            split_chinese_qa,
            split_english_qa,
            build_chunks_from_sections,
            Section,
        )

        sections = split_chinese_qa(text)
        if not sections:
            sections = split_english_qa(text)

        if not sections:
            sections = [Section(type="narrative", content=text, start_page=1, end_page=1)]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [chunk_text for chunk_text, _, _ in results]

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
    ) -> List[Tuple[str, int]]:
        """Split page-segmented text using Q&A detection (for PDF).

        Detection order: Chinese splitter, English splitter, LLM structure
        detection (only when an ``llm_client`` was supplied), and finally a
        single narrative section spanning all pages. The per-chunk metadata
        of the result is kept in ``self._chunk_metadata``.

        Args:
            pages: List of (page_number, page_text) tuples.
            overlap_tokens: Not used by this strategy.

        Returns:
            List of (chunk_text, page_number) where page_number
            references the question location for Q&A chunks.
        """
        if not pages:
            return []

        from app.utils.qa_chunking import (
            preprocess_text,
            split_chinese_qa,
            split_english_qa,
            build_chunks_from_sections,
            Section,
        )

        full_text = preprocess_text(pages)

        sections = split_chinese_qa(full_text)
        if not sections:
            sections = split_english_qa(full_text)

        if not sections and self._llm_client is not None:
            sections = self._detect_sections_with_llm(full_text)

        if not sections:
            sections = [Section(type="narrative", content=full_text, start_page=1, end_page=len(pages))]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [(chunk_text, page_num) for chunk_text, page_num, _ in results]

    def _detect_sections_with_llm(self, full_text: str) -> list:
        """Ask the LLM client for the document structure, blocking until it answers.

        Returns an empty list when an event loop is already running in this
        thread (blocking on it is not possible) or when the LLM call or the
        response parsing fails; the caller then uses the narrative fallback.
        """
        import asyncio

        from app.utils.qa_chunking import (
            build_structure_detection_prompt,
            parse_llm_structure_response,
        )

        prompt = build_structure_detection_prompt(full_text)

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            pass  # No loop is running in this thread, so blocking is safe.
        else:
            logger.info("Skipping LLM structure detection: an event loop is already running")
            return []

        try:
            # get_event_loop() raises RuntimeError when the thread has no
            # current loop (e.g. worker threads). Create one in that case
            # instead of letting the error disable LLM detection.
            try:
                loop = asyncio.get_event_loop()
            except RuntimeError:
                loop = None
            if loop is None or loop.is_closed():
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            response = loop.run_until_complete(
                self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
            )
            return parse_llm_structure_response(response)
        except Exception:
            # Best effort: any failure here falls back to narrative chunking.
            logger.warning("LLM structure detection failed, using fallback", exc_info=True)
            return []


def get_chunking_strategy(
    name: str,
    settings: "Settings",
    llm_client: Optional["LLMClient"] = None,
) -> ChunkingStrategy:
    """Factory: return the named chunking strategy.

    Args:
        name: "token" or "question". Any other value falls back to the token
            strategy and logs a warning.
        settings: Application settings instance. ``chunk_size`` and
            ``chunk_overlap`` are read from it for the token strategy.
        llm_client: Optional LLM client handed to the question strategy.
            Ignored by the token strategy.

    Returns:
        ChunkingStrategy instance.
    """
    if name == "question":
        return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)

    if name != "token":
        # Keep the historical fallback, but make a misconfigured name visible.
        logger.warning("Unknown chunking strategy %r; falling back to token chunking", name)

    return TokenChunkingStrategy(
        chunk_size=settings.chunk_size,
        overlap=settings.chunk_overlap,
    )