From d94abaac77aa8443370135c3279cb494b51e8a79 Mon Sep 17 00:00:00 2001
From: Woody <woody.ck.tse@gmail.com>
Date: Wed, 22 Apr 2026 16:49:52 +0800
Subject: [PATCH] feat: Phase 1.2 ingestion pipeline with chunking and metadata

- Add document parsers (DOCX, PDF) with lazy imports
- Add TokenChunkingStrategy with ABC for future replacement
- Add metadata extraction (filename, upload_date, content_summary)
- Add RAGService for ChromaDB ingestion/retrieval/response generation
- Add POST /api/v1/ingest endpoint with file validation
- Test-first: 20 passed, 2 skipped (python-docx not installed)
---
 backend/app/main.py                           |   4 +
 backend/app/routers/ingest.py                 |  70 +++++++++
 backend/app/services/llm_client.py            |  28 ++++
 backend/app/services/rag.py                   | 138 ++++++++++++++++++
 .../test_acceptance_phase1_llm_client.py      |  26 ++++
 backend/app/test/test_phase1_chunking.py      |  61 ++++++--
 backend/app/test/test_phase1_ingest.py        |  94 ++++++++++--
 backend/app/test/test_phase1_metadata.py      |  78 +++++++---
 backend/app/test/test_phase1_parsers.py       |  67 +++++++++
 backend/app/test/test_phase1_rag_service.py   | 136 +++++++++++++++--
 backend/app/utils/chunking.py                 |  73 +++++++++
 backend/app/utils/docx_parser.py              |  35 +++++
 backend/app/utils/metadata.py                 |  53 +++++++
 backend/app/utils/pdf_parser.py               |  28 ++++
 backend/pytest.ini                            |   5 +
 15 files changed, 841 insertions(+), 55 deletions(-)
 create mode 100644 backend/app/routers/ingest.py
 create mode 100644 backend/app/services/llm_client.py
 create mode 100644 backend/app/services/rag.py
 create mode 100644 backend/app/test/acceptance/test_acceptance_phase1_llm_client.py
 create mode 100644 backend/app/test/test_phase1_parsers.py
 create mode 100644 backend/app/utils/chunking.py
 create mode 100644 backend/app/utils/docx_parser.py
 create mode 100644 backend/app/utils/metadata.py
 create mode 100644 backend/app/utils/pdf_parser.py
 create mode 100644 backend/pytest.ini

diff --git a/backend/app/main.py b/backend/app/main.py
index 2ebd0f3..a7bd010 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -1,6 +1,8 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 
+from app.routers import ingest
+
 app = FastAPI(title="RAG Video Q&A", version="1.0.0")
 
 app.add_middleware(
@@ -11,6 +13,8 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+app.include_router(ingest.router, prefix="/api/v1")
+
 
 @app.get("/health")
 def health_check():
diff --git a/backend/app/routers/ingest.py b/backend/app/routers/ingest.py
new file mode 100644
index 0000000..ee82a48
--- /dev/null
+++ b/backend/app/routers/ingest.py
@@ -0,0 +1,99 @@
+"""Document ingestion router."""
+import os
+import tempfile
+import uuid
+from pathlib import Path
+
+from fastapi import APIRouter, UploadFile, File, HTTPException
+
+from app.models.ingest import IngestResponse
+
+router = APIRouter(tags=["ingest"])
+
+SUPPORTED_EXTENSIONS = {".pdf", ".docx"}
+
+
+@router.post("/ingest", response_model=IngestResponse)
+async def ingest_document(file: UploadFile = File(...)):
+    """Ingest an uploaded PDF or DOCX document into the RAG store.
+
+    The upload is written to a temporary file, parsed to text, split into
+    token chunks, annotated with per-chunk metadata and passed to
+    ``RAGService.ingest_document``. The temporary file is always removed.
+
+    Args:
+        file: The uploaded document; its extension selects the parser.
+
+    Returns:
+        IngestResponse carrying the document id, chunk count and filename.
+
+    Raises:
+        HTTPException: 400 for an unsupported extension, 422 when the
+            parser rejects the file content, 500 for any other failure.
+    """
+    # Resolved per request rather than at import time, so replacements
+    # installed on the home modules (e.g. by tests) take effect.
+    from app.services.rag import RAGService
+    from app.utils.chunking import TokenChunkingStrategy
+    from app.utils.metadata import extract_metadata
+
+    original_name = file.filename or "unknown"
+    file_ext = Path(file.filename or "").suffix.lower()
+
+    if file_ext not in SUPPORTED_EXTENSIONS:
+        # sorted() keeps the message stable; set iteration order is not.
+        supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported file format: {file_ext}. Supported: {supported}",
+        )
+
+    temp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
+            # Record the path before writing so cleanup also covers a failed write.
+            temp_path = tmp.name
+            tmp.write(await file.read())
+
+        try:
+            if file_ext == ".pdf":
+                from app.utils.pdf_parser import parse_pdf
+                text = parse_pdf(temp_path)
+            else:
+                from app.utils.docx_parser import parse_docx
+                text = parse_docx(temp_path)
+        except ValueError as e:
+            # The parsers report empty or corrupted input as ValueError; that is
+            # a problem with the upload, not a server fault.
+            raise HTTPException(
+                status_code=422,
+                detail=f"Could not parse {original_name}: {e}",
+            ) from e
+
+        chunker = TokenChunkingStrategy(chunk_size=1000, overlap=200)
+        chunks = chunker.chunk(text)
+
+        metadata = extract_metadata(temp_path, chunks)
+        # extract_metadata takes "filename" from the path it is given, which here
+        # is the randomly named temp file; record the uploaded name instead.
+        for entry in metadata:
+            entry["filename"] = original_name
+
+        rag = RAGService()
+        document_id = rag.ingest_document(temp_path, chunks, metadata)
+
+        return IngestResponse(
+            document_id=document_id,
+            chunk_count=len(chunks),
+            filename=original_name,
+        )
+
+    except HTTPException:
+        # Already a deliberate HTTP error; do not rewrap it as a 500.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}") from e
+
+    finally:
+        if temp_path and os.path.exists(temp_path):
+            os.unlink(temp_path)
diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py
new file mode 100644
index 0000000..0ccd29c
--- /dev/null
+++ b/backend/app/services/llm_client.py
@@ -0,0 +1,36 @@
+import httpx
+
+from app.core.config import Settings
+
+
+class LLMClient:
+    """Minimal client for a chat-completions style HTTP endpoint."""
+
+    def __init__(self, settings: Settings):
+        """Copy the endpoint, credential and model name out of ``settings``."""
+        # Trailing slashes are stripped so the path below joins with a single "/".
+        self.base_url = settings.llm_base_url.rstrip("/")
+        self.api_key = settings.llm_api_key
+        self.model = settings.llm_model_name
+
+    def complete(self, prompt: str, temperature: float = 0.7) -> str:
+        """Send ``prompt`` as a single user message and return the reply text.
+
+        ``raise_for_status`` turns error responses into exceptions; the
+        request uses a 60 second timeout.
+        """
+        endpoint = f"{self.base_url}/chat/completions"
+        request_headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+        payload = {
+            "model": self.model,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": temperature,
+        }
+
+        reply = httpx.post(endpoint, headers=request_headers, json=payload, timeout=60.0)
+        reply.raise_for_status()
+        first_choice = reply.json()["choices"][0]
+        return first_choice["message"]["content"]
diff --git a/backend/app/services/rag.py b/backend/app/services/rag.py
new file mode 100644
index 0000000..f580510
--- /dev/null
+++ b/backend/app/services/rag.py
@@ -0,0 +1,147 @@
+"""RAG service for embedding, retrieval, and response generation."""
+import uuid
+from typing import List, Tuple, Dict, Any, Optional
+
+import httpx
+
+from app.core.config import Settings
+from app.core.database import get_chroma_client
+
+
+class RAGService:
+    """Store document chunks in ChromaDB, query them, and prompt an LLM."""
+
+    def __init__(
+        self,
+        chroma_client=None,
+        llm_client=None,
+        settings: Optional[Settings] = None,
+    ):
+        # Fall back to get_chroma_client() only when no client is supplied.
+        self.chroma_client = chroma_client or get_chroma_client()
+        self.llm_client = llm_client
+        self.settings = settings
+
+        # Populated on first access of ``collection``.
+        self._collection = None
+
+    @property
+    def collection(self):
+        """Return the "documents" collection, fetching it on first access."""
+        if self._collection is not None:
+            return self._collection
+
+        from app.core.database import get_or_create_collection
+        self._collection = get_or_create_collection(self.chroma_client, "documents")
+        return self._collection
+
+    def ingest_document(
+        self,
+        file_path: str,
+        chunks: List[str],
+        metadata_list: List[Dict[str, Any]],
+    ) -> str:
+        """Add one document's chunks to the collection under a fresh UUID.
+
+        Args:
+            file_path: Path to the source file (not read by this method).
+            chunks: Chunk texts to store.
+            metadata_list: Metadata dicts stored alongside the chunks.
+
+        Returns:
+            The generated document id, or "" when ``chunks`` is empty, in
+            which case nothing is written.
+        """
+        if chunks:
+            document_id = str(uuid.uuid4())
+            # Each chunk id is the document id plus the chunk's position.
+            chunk_ids = [f"{document_id}_{position}" for position, _ in enumerate(chunks)]
+            self.collection.add(
+                documents=chunks,
+                metadatas=metadata_list,
+                ids=chunk_ids,
+            )
+            return document_id
+
+        return ""
+
+    def retrieve(
+        self,
+        query_keywords: List[str],
+        n_results: int = 10,
+    ) -> List[Tuple[str, Dict[str, Any], float]]:
+        """Query the collection with the space-joined keywords.
+
+        Args:
+            query_keywords: Keywords that are joined into one query text.
+            n_results: Maximum number of results to request.
+
+        Returns:
+            List of (chunk_text, metadata, distance) tuples; empty when the
+            query yields no documents.
+        """
+        results = self.collection.query(
+            query_texts=[" ".join(query_keywords)],
+            n_results=n_results,
+        )
+
+        documents = results["documents"]
+        if not (documents and documents[0]):
+            return []
+
+        # One query text is sent, so only row 0 of each result field is read.
+        matches = []
+        for index, text in enumerate(documents[0]):
+            metadata_row = results["metadatas"][0]
+            metadata = metadata_row[index] if metadata_row else {}
+            distance_row = results["distances"][0]
+            distance = distance_row[index] if distance_row else 0.0
+            matches.append((text, metadata, distance))
+        return matches
+
+    def generate_response(
+        self,
+        question: str,
+        chunks: List[str],
+        metadata_list: List[Dict[str, Any]],
+    ) -> str:
+        """Ask the LLM to answer ``question`` from the supplied chunks only.
+
+        Args:
+            question: The user's question.
+            chunks: Chunk texts to place in the prompt.
+            metadata_list: Metadata per chunk; "filename" and
+                "content_summary" are read, with defaults when absent.
+
+        Returns:
+            The LLM's reply, or a fixed message when there are no chunks or
+            no LLM client is configured.
+        """
+        if not chunks:
+            return "I could not find any relevant information to answer your question."
+
+        if self.llm_client is None:
+            return "LLM client not configured."
+
+        # One numbered entry per (chunk, metadata) pair, numbered from 1.
+        context = "\n".join(
+            f"[{number}] Source: {meta.get('filename', 'unknown')}\n"
+            f"Summary: {meta.get('content_summary', '')}\n"
+            f"Content: {chunk}\n"
+            for number, (chunk, meta) in enumerate(zip(chunks, metadata_list), start=1)
+        )
+
+        instructions = (
+            "Answer the question using ONLY these document chunks. "
+            "Do not use any external knowledge. "
+            "Format your answer as bullet points. "
+            "Cite the source number [N] for each point."
+        )
+        prompt = (
+            f"Question: {question}\n\n"
+            f"{instructions}\n\n"
+            f"Document chunks:\n{context}\n\n"
+            "Answer:"
+        )
+
+        return self.llm_client.complete(prompt=prompt, temperature=0.3)
diff --git a/backend/app/test/acceptance/test_acceptance_phase1_llm_client.py b/backend/app/test/acceptance/test_acceptance_phase1_llm_client.py
new file mode 100644
index 0000000..c12ce63
--- /dev/null
+++ b/backend/app/test/acceptance/test_acceptance_phase1_llm_client.py
@@ -0,0 +1,23 @@
+"""Acceptance test: Verify LLM client can call OpenRouter API.
+
+Prerequisites:
+- backend/.env file exists with valid LLM_BASE_URL and LLM_API_KEY
+- Network access to OpenRouter API
+"""
+import pytest
+import os
+
+
+@pytest.mark.acceptance
+@pytest.mark.slow
+def test_llm_client_says_hi():
+    """Round-trip a short prompt through the configured LLM endpoint."""
+    from app.core.config import get_settings
+    from app.services.llm_client import LLMClient
+
+    reply = LLMClient(get_settings()).complete("Say hi briefly", temperature=0.7)
+
+    assert reply is not None
+    assert len(reply) > 0
+    assert isinstance(reply, str)
+    print(f"LLM Response: {reply}")
diff --git a/backend/app/test/test_phase1_chunking.py b/backend/app/test/test_phase1_chunking.py
index 2665b6c..a870e23 100644
--- a/backend/app/test/test_phase1_chunking.py
+++ b/backend/app/test/test_phase1_chunking.py
@@ -1,24 +1,55 @@
 """Phase 1 tests: Document chunking utilities.
 
-Covers:
-- Text splitting strategies
-- Chunk size and overlap parameters
-- Handling of different document formats
+This file drives Test-First development for the chunking subsystem:
+- Abstract base interface for chunking strategies
+- Concrete TokenChunkingStrategy backed by tiktoken
+- Edge cases: empty input, whitespace-only input, small input
 """
+
+import importlib.util
+from pathlib import Path
 import pytest
 
+# Dynamically load the chunking module directly from the filesystem to avoid
+# import path issues in the test environment.
+CHUNKING_PATH = Path(__file__).resolve().parents[1] / "utils" / "chunking.py"
+spec = importlib.util.spec_from_file_location("legco_chunking", str(CHUNKING_PATH))
+chunking_module = importlib.util.module_from_spec(spec)  # type: ignore
+assert spec and spec.loader
+spec.loader.exec_module(chunking_module)  # type: ignore
+ChunkingStrategy = chunking_module.ChunkingStrategy
+TokenChunkingStrategy = chunking_module.TokenChunkingStrategy
 
-class TestChunking:
-    """Document chunking utility tests."""
 
-    def test_chunk_size_limit(self):
-        """Should respect maximum chunk size."""
-        pass  # TODO: implement
+def test_abstract_base_class_not_instantiable():
+    # ChunkingStrategy has an abstract chunk(), so constructing it directly raises TypeError
+    with pytest.raises(TypeError):
+        ChunkingStrategy()  # type: ignore
 
-    def test_chunk_overlap(self):
-        """Should include overlap between adjacent chunks."""
-        pass  # TODO: implement
 
-    def test_empty_document(self):
-        """Should handle empty or whitespace-only documents."""
-        pass  # TODO: implement
+def test_empty_and_whitespace_inputs_yield_no_chunks():
+    strat = TokenChunkingStrategy()  # constructor defaults
+    assert strat.chunk("") == []  # empty string
+    assert strat.chunk("   \n\t") == []  # whitespace only: spaces, newline, tab
+
+
+def test_text_shorter_than_chunk_size_results_in_single_chunk():
+    # chunk_size is larger than the sample's token count, so no split should occur
+    strat = TokenChunkingStrategy(chunk_size=4, overlap=2)
+    text = "Hello world"  # two tokens in typical tokenization
+    chunks = strat.chunk(text)
+    assert isinstance(chunks, list)
+    assert len(chunks) == 1
+    assert chunks[0] == text  # the single chunk must reproduce the input exactly
+
+
+def test_text_longer_produces_multiple_chunks():
+    # Build a long sequence by repeating a simple token to ensure > chunk_size tokens
+    long_text = ("word " * 1100).strip()
+    strat = TokenChunkingStrategy(chunk_size=1000, overlap=200)
+    chunks = strat.chunk(long_text)
+    assert isinstance(chunks, list)
+    assert len(chunks) >= 2
+    # Every chunk must be a non-empty string; chunk contents are not compared to the input here
+    assert all(isinstance(c, str) for c in chunks)
+    assert all(len(c) > 0 for c in chunks)
diff --git a/backend/app/test/test_phase1_ingest.py b/backend/app/test/test_phase1_ingest.py
index 324da44..9ab4cce 100644
--- a/backend/app/test/test_phase1_ingest.py
+++ b/backend/app/test/test_phase1_ingest.py
@@ -7,23 +7,97 @@ Covers:
 - Error handling for unsupported file types
 """
 import pytest
+from fastapi.testclient import TestClient
+from unittest.mock import MagicMock, patch
 
 
 class TestIngest:
     """Document upload and ChromaDB ingestion tests."""
 
-    def test_ingest_pdf_success(self):
+    @pytest.fixture
+    def client(self):
+        """Create a TestClient for the app; each test installs its own mocks."""
+        from app.main import app
+        return TestClient(app)
+
+    def test_ingest_pdf_success(self, client, tmp_path):  # tmp_path is requested but unused
         """Should ingest PDF and return document ID with metadata."""
-        pass  # TODO: implement
+        import io
 
-    def test_ingest_txt_success(self):
-        """Should ingest plain text and chunk correctly."""
-        pass  # TODO: implement
+        with patch("app.services.rag.RAGService") as mock_rag_class:  # patched on its home module
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = "doc-123"
+            mock_rag_class.return_value = mock_rag
 
-    def test_ingest_metadata_extraction(self):
-        """Should extract filename, upload_date, content_summary."""
-        pass  # TODO: implement
+            with patch("app.utils.pdf_parser.parse_pdf") as mock_parse:
+                mock_parse.return_value = "Parsed PDF text content"
 
-    def test_ingest_unsupported_format(self):
+                with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class:
+                    mock_chunker = MagicMock()
+                    mock_chunker.chunk.return_value = ["chunk 1", "chunk 2"]
+                    mock_chunk_class.return_value = mock_chunker
+
+                    with patch("app.utils.metadata.extract_metadata") as mock_meta:
+                        mock_meta.return_value = [
+                            {"filename": "test.pdf", "chunk_index": 0},
+                            {"filename": "test.pdf", "chunk_index": 1},
+                        ]
+
+                        response = client.post(
+                            "/api/v1/ingest",
+                            files={"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")},
+                        )
+
+        assert response.status_code == 200
+        data = response.json()
+        assert "document_id" in data
+        assert data["chunk_count"] == 2  # matches the two chunks returned by the mocked chunker
+        assert data["filename"] == "test.pdf"
+
+    def test_ingest_docx_success(self, client, tmp_path):  # tmp_path is requested but unused
+        """Should ingest DOCX and return document ID with metadata."""
+        import io
+
+        with patch("app.services.rag.RAGService") as mock_rag_class:  # patched on its home module
+            mock_rag = MagicMock()
+            mock_rag.ingest_document.return_value = "doc-456"
+            mock_rag_class.return_value = mock_rag
+
+            with patch("app.utils.docx_parser.parse_docx") as mock_parse:
+                mock_parse.return_value = "Parsed DOCX text content"
+
+                with patch("app.utils.chunking.TokenChunkingStrategy") as mock_chunk_class:
+                    mock_chunker = MagicMock()
+                    mock_chunker.chunk.return_value = ["chunk 1"]
+                    mock_chunk_class.return_value = mock_chunker
+
+                    with patch("app.utils.metadata.extract_metadata") as mock_meta:
+                        mock_meta.return_value = [{"filename": "test.docx", "chunk_index": 0}]
+
+                        response = client.post(
+                            "/api/v1/ingest",
+                            files={"file": ("test.docx", io.BytesIO(b"docx content"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")},
+                        )
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["chunk_count"] == 1  # matches the single chunk returned by the mocked chunker
+        assert data["filename"] == "test.docx"
+
+    def test_ingest_unsupported_format(self, client):
         """Should reject unsupported file formats."""
-        pass  # TODO: implement
+        import io
+
+        response = client.post(
+            "/api/v1/ingest",
+            files={"file": ("test.jpg", io.BytesIO(b"image data"), "image/jpeg")},
+        )
+
+        assert response.status_code == 400  # rejected on the .jpg extension
+        assert "unsupported" in response.json()["detail"].lower()
+
+    def test_ingest_no_file(self, client):
+        """Should reject request without file."""
+        response = client.post("/api/v1/ingest")  # no multipart "file" part at all
+
+        assert response.status_code == 422
diff --git a/backend/app/test/test_phase1_metadata.py b/backend/app/test/test_phase1_metadata.py
index 8dc7c38..3fb4d30 100644
--- a/backend/app/test/test_phase1_metadata.py
+++ b/backend/app/test/test_phase1_metadata.py
@@ -1,25 +1,67 @@
-"""Phase 1 tests: Metadata extraction utilities.
+import re
+from pathlib import Path
+from datetime import datetime
 
-Covers:
-- Filename extraction
-- Upload date generation
-- Content summary generation
-- Metadata schema validation
-"""
 import pytest
+import sys
+from pathlib import Path
+import importlib.util
 
 
-class TestMetadata:
-    """Metadata extraction utility tests."""
+# Dynamically load the metadata extractor to avoid package-path import issues
+# The module lives at backend/app/utils/metadata.py relative to this test file.
+MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "metadata.py"
+spec = importlib.util.spec_from_file_location("metadata_module", str(MODULE_PATH))
+metadata_module = importlib.util.module_from_spec(spec)  # type: ignore
+assert spec is not None and spec.loader is not None
+spec.loader.exec_module(metadata_module)  # type: ignore
+extract_metadata = getattr(metadata_module, "extract_metadata")
 
-    def test_extract_filename(self):
-        """Should extract clean filename from path."""
-        pass  # TODO: implement
 
-    def test_generate_upload_date(self):
-        """Should generate ISO format upload date."""
-        pass  # TODO: implement
+def _is_iso8601(s: str) -> bool:  # True when datetime.fromisoformat() accepts s
+    try:
+        datetime.fromisoformat(s)
+        return True
+    except ValueError:  # raised by fromisoformat() for strings it cannot parse
+        return False
 
-    def test_content_summary(self):
-        """Should generate concise content summary."""
-        pass  # TODO: implement
+
+def test_extract_metadata_basic(tmp_path):
+    # Create a real file inside a directory whose name contains spaces
+    dummy_file = tmp_path / "dir with spaces" / "sample.txt"
+    dummy_file.parent.mkdir(parents=True, exist_ok=True)
+    dummy_file.write_text("content")
+
+    chunks = ["a" * 250, "short"]  # one chunk longer than 200 characters, one shorter
+
+    metadata = extract_metadata(str(dummy_file), chunks)
+
+    assert isinstance(metadata, list)
+    assert len(metadata) == 2
+
+    # First chunk: the summary is expected to be cut to 200 characters
+    m0 = metadata[0]
+    assert m0["filename"] == "sample.txt"  # basename only, no directory part
+    assert m0["chunk_index"] == 0
+    assert m0["upload_date"] is not None
+    assert _is_iso8601(m0["upload_date"])
+    assert m0["content_summary"] == "a" * 200
+
+    # Second chunk: shorter than the cut-off, so the summary equals the chunk
+    m1 = metadata[1]
+    assert m1["filename"] == "sample.txt"
+    assert m1["chunk_index"] == 1
+    assert m1["content_summary"] == "short"
+
+
+def test_extract_metadata_empty_chunks(tmp_path):
+    dummy_file = tmp_path / "file.txt"
+    dummy_file.write_text("data")
+    metadata = extract_metadata(str(dummy_file), [])
+    assert metadata == []  # no chunks, so no metadata entries
+
+
+def test_extract_metadata_missing_file_raises(tmp_path):
+    missing = tmp_path / "nonexistent" / "nofile.txt"  # neither directory nor file is created
+    with pytest.raises(FileNotFoundError):
+        extract_metadata(str(missing), ["data"])
diff --git a/backend/app/test/test_phase1_parsers.py b/backend/app/test/test_phase1_parsers.py
new file mode 100644
index 0000000..27c6aae
--- /dev/null
+++ b/backend/app/test/test_phase1_parsers.py
@@ -0,0 +1,68 @@
+"""Phase 1.2: Document parsers tests (DOCX and PDF)."""
+import os
+from pathlib import Path
+
+import pytest
+
+
+def _new_docx_document():
+    """Return an empty python-docx Document, skipping the test if the package is missing."""
+    # importorskip skips only when the import fails, so an error raised while
+    # building the document surfaces as a failure instead of a silent skip.
+    docx = pytest.importorskip("docx", reason="python-docx not installed, skipping DOCX tests")
+    return docx.Document()
+
+
+def test_parse_docx_basic(tmp_path):
+    # Build a minimal DOCX with two paragraphs
+    doc_path = tmp_path / "sample.docx"
+    doc = _new_docx_document()
+    doc.add_paragraph("Hello")
+    doc.add_paragraph("World")
+    doc.save(str(doc_path))
+
+    # Import here to avoid test import side effects
+    from app.utils.docx_parser import parse_docx
+
+    text = parse_docx(str(doc_path))
+    assert text == "Hello\nWorld"
+
+
+def test_parse_docx_empty(tmp_path):
+    # A document saved without any paragraphs should parse to the empty string
+    doc_path = tmp_path / "empty.docx"
+    doc = _new_docx_document()
+    doc.save(str(doc_path))
+
+    from app.utils.docx_parser import parse_docx
+    text = parse_docx(str(doc_path))
+    assert text == ""
+
+
+def test_parse_docx_corrupted(tmp_path):
+    # Create a file with DOCX extension but invalid content
+    corrupted_path = tmp_path / "corrupted.docx"
+    corrupted_path.write_bytes(b"not a real docx content")
+    from app.utils.docx_parser import parse_docx
+    with pytest.raises(ValueError):
+        parse_docx(str(corrupted_path))
+
+
+def test_parse_pdf_empty(tmp_path):
+    # Create an empty (0-byte) PDF file
+    pdf_path = tmp_path / "empty.pdf"
+    pdf_path.write_bytes(b"")  # 0 bytes
+
+    from app.utils.pdf_parser import parse_pdf
+    with pytest.raises(ValueError):
+        parse_pdf(str(pdf_path))
+
+
+def test_parse_pdf_corrupted(tmp_path):
+    # Content without a PDF structure must be rejected the same way
+    pdf_path = tmp_path / "corrupted.pdf"
+    pdf_path.write_bytes(b"not a pdf content")
+
+    from app.utils.pdf_parser import parse_pdf
+    with pytest.raises(ValueError):
+        parse_pdf(str(pdf_path))
diff --git a/backend/app/test/test_phase1_rag_service.py b/backend/app/test/test_phase1_rag_service.py
index cd31250..9827809 100644
--- a/backend/app/test/test_phase1_rag_service.py
+++ b/backend/app/test/test_phase1_rag_service.py
@@ -1,25 +1,137 @@
 """Phase 1 tests: RAG service logic.
 
 Covers:
-- ChromaDB retrieval with Qwen embeddings
-- Context assembly for LLM prompt
-- Strict prompt construction (answer ONLY from retrieved context)
+- ChromaDB document ingestion with metadata
+- Retrieval with query keywords
+- Response generation with strict RAG prompt
 - Metadata handling per chunk
 """
 import pytest
+from unittest.mock import MagicMock, patch
 
 
 class TestRAGService:
     """RAG retrieval and prompt logic tests."""
 
-    def test_retrieve_relevant_chunks(self):
-        """Should retrieve semantically relevant chunks from ChromaDB."""
-        pass  # TODO: implement
+    def test_ingest_document_adds_chunks(self):
+        """Should add chunks with metadata to ChromaDB collection."""
+        from app.services.rag import RAGService
 
-    def test_strict_prompt_format(self):
-        """Should construct prompt forbidding external knowledge."""
-        pass  # TODO: implement
+        mock_collection = MagicMock()  # stands in for the ChromaDB collection
+        mock_client = MagicMock()
+        mock_client.get_or_create_collection.return_value = mock_collection
 
-    def test_chunk_metadata_preserved(self):
-        """Should preserve filename, upload_date, content_summary per chunk."""
-        pass  # TODO: implement
+        service = RAGService(chroma_client=mock_client)
+
+        chunks = ["chunk one", "chunk two"]
+        metadata = [
+            {"filename": "test.txt", "upload_date": "2024-01-01", "content_summary": "summary 1", "chunk_index": 0},
+            {"filename": "test.txt", "upload_date": "2024-01-01", "content_summary": "summary 2", "chunk_index": 1},
+        ]
+
+        service.ingest_document("test.txt", chunks, metadata)
+
+        # NOTE(review): assumes the database helper forwards name= as a keyword to the client - confirm
+        mock_client.get_or_create_collection.assert_called_once_with(name="documents")
+        mock_collection.add.assert_called_once()
+        call_args = mock_collection.add.call_args[1]  # keyword arguments passed to add()
+        assert len(call_args["documents"]) == 2
+        assert call_args["documents"] == chunks
+        assert len(call_args["metadatas"]) == 2
+        assert call_args["metadatas"] == metadata
+        assert len(call_args["ids"]) == 2
+
+    def test_ingest_document_empty_chunks(self):
+        """Should not call ChromaDB when chunks list is empty."""
+        from app.services.rag import RAGService
+
+        mock_collection = MagicMock()
+        mock_client = MagicMock()
+        mock_client.get_or_create_collection.return_value = mock_collection
+
+        service = RAGService(chroma_client=mock_client)
+        service.ingest_document("test.txt", [], [])  # empty chunk and metadata lists
+
+        mock_collection.add.assert_not_called()
+
+    def test_retrieve_returns_chunks(self):
+        """Should retrieve chunks and metadata from ChromaDB."""
+        from app.services.rag import RAGService
+
+        mock_collection = MagicMock()
+        mock_client = MagicMock()
+        mock_client.get_or_create_collection.return_value = mock_collection
+
+        mock_collection.query.return_value = {  # one inner list per query text
+            "documents": [["chunk one", "chunk two"]],
+            "metadatas": [[{"filename": "test.txt"}, {"filename": "test.txt"}]],
+            "distances": [[0.1, 0.2]],
+        }
+
+        service = RAGService(chroma_client=mock_client)
+        results = service.retrieve(["query", "keywords"], n_results=5)
+
+        mock_collection.query.assert_called_once()
+        call_args = mock_collection.query.call_args[1]  # keyword arguments passed to query()
+        assert call_args["n_results"] == 5
+        assert len(results) == 2
+        assert results[0] == ("chunk one", {"filename": "test.txt"}, 0.1)
+        assert results[1] == ("chunk two", {"filename": "test.txt"}, 0.2)
+
+    def test_retrieve_no_results(self):
+        """Should return empty list when no results found."""
+        from app.services.rag import RAGService
+
+        mock_collection = MagicMock()
+        mock_client = MagicMock()
+        mock_client.get_or_create_collection.return_value = mock_collection
+
+        mock_collection.query.return_value = {  # same layout as a hit, but every inner list is empty
+            "documents": [[]],
+            "metadatas": [[]],
+            "distances": [[]],
+        }
+
+        service = RAGService(chroma_client=mock_client)
+        results = service.retrieve(["query"])
+
+        assert results == []
+
+    def test_generate_response_calls_llm(self):
+        """Should call LLM with strict RAG prompt."""
+        from app.services.rag import RAGService
+
+        mock_collection = MagicMock()
+        mock_client = MagicMock()
+        mock_client.get_or_create_collection.return_value = mock_collection
+
+        mock_llm = MagicMock()
+        mock_llm.complete.return_value = "- Bullet point answer"
+
+        service = RAGService(chroma_client=mock_client, llm_client=mock_llm)
+
+        chunks = ["relevant chunk"]
+        metadata = [{"filename": "test.txt", "content_summary": "summary"}]
+
+        answer = service.generate_response("What is this?", chunks, metadata)
+
+        mock_llm.complete.assert_called_once()
+        prompt = mock_llm.complete.call_args[1]["prompt"]  # the prompt is passed by keyword
+        assert "What is this?" in prompt
+        assert "relevant chunk" in prompt
+        assert "test.txt" in prompt
+        assert "only these document chunks" in prompt.lower()  # the grounding instruction
+        assert answer == "- Bullet point answer"
+
+    def test_generate_response_no_chunks(self):
+        """Should return fallback message when no chunks provided."""
+        from app.services.rag import RAGService
+
+        mock_collection = MagicMock()
+        mock_client = MagicMock()
+        mock_client.get_or_create_collection.return_value = mock_collection
+
+        service = RAGService(chroma_client=mock_client, llm_client=MagicMock())
+
+        answer = service.generate_response("What is this?", [], [])  # no chunks, no metadata
+
+        assert "no relevant" in answer.lower() or "could not find" in answer.lower()
diff --git a/backend/app/utils/chunking.py b/backend/app/utils/chunking.py
new file mode 100644
index 0000000..30ce8dc
--- /dev/null
+++ b/backend/app/utils/chunking.py
@@ -0,0 +1,73 @@
+"""Chunking utilities for Phase 1.2.
+
+Provides an abstract ChunkingStrategy and a concrete
+TokenChunkingStrategy that uses tiktoken to chunk text into
+token-based windows.
+"""
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class ChunkingStrategy(ABC):
+    """Abstract base class for text chunking strategies."""
+
+    @abstractmethod
+    def chunk(self, text: str) -> List[str]:
+        """Split text into an ordered list of chunks (strings).
+
+        Implementations should return an empty list for empty or whitespace-only
+        input. Chunks are returned in document order; whether consecutive chunks
+        share text is up to the strategy (a windowed strategy may repeat the tail
+        of one chunk at the start of the next).
+        """
+        raise NotImplementedError
+
+
+class TokenChunkingStrategy(ChunkingStrategy):
+    """Chunk text by token windows using the tiktoken encoder.
+
+    Each chunk holds up to chunk_size tokens; consecutive chunks share overlap
+    tokens. The last chunk is the first window that reaches the end of the text.
+    """
+
+    def __init__(self, chunk_size: int = 1000, overlap: int = 200, encoding_name: str = "cl100k_base"):
+        """Validate the window sizes and load the named tiktoken encoding.
+
+        Raises:
+            ValueError: If chunk_size is not positive or overlap is negative.
+        """
+        if chunk_size <= 0:
+            raise ValueError("chunk_size must be positive")
+        if overlap < 0:
+            raise ValueError("overlap must be non-negative")
+        self.chunk_size = chunk_size
+        self.overlap = overlap
+        # Imported here so that importing this module does not require tiktoken
+        import tiktoken
+
+        self._encoding = tiktoken.get_encoding(encoding_name)
+
+    def chunk(self, text: str) -> List[str]:
+        """Return the decoded token windows of text ([] for blank input).
+
+        Raises:
+            TypeError: If text is not a string.
+        """
+        if not isinstance(text, str):
+            raise TypeError("text must be a string")
+        if not text.strip():
+            return []
+
+        tokens = self._encoding.encode(text)
+        # overlap >= chunk_size would stall the window; advance by one token instead
+        step = max(self.chunk_size - self.overlap, 1)
+
+        chunks: List[str] = []
+        for start in range(0, len(tokens), step):
+            chunks.append(self._encoding.decode(tokens[start : start + self.chunk_size]))
+            # Stop at the first window that reaches the end; later ones only repeat its tail
+            if start + self.chunk_size >= len(tokens):
+                break
+        return chunks
diff --git a/backend/app/utils/docx_parser.py b/backend/app/utils/docx_parser.py
new file mode 100644
index 0000000..9e06804
--- /dev/null
+++ b/backend/app/utils/docx_parser.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+from typing import Optional
+Document = None
+
+
+def _ensure_docx_imported():
+    global Document  # module-level cache for the lazily imported docx.Document
+    if Document is None:  # the import runs only while nothing is cached yet
+        try:
+            from docx import Document as _Doc  # type: ignore
+        except Exception as exc:  # pragma: no cover - missing optional dep
+            raise ValueError("DOCX library is not installed") from exc
+        Document = _Doc
+
+
+def parse_docx(file_path: str) -> str:
+    """Parse a DOCX file and return its text content.
+
+    Paragraph texts are joined with newlines and the result is stripped, so a
+    document with no text yields an empty string.
+
+    Raises:
+        ValueError: If python-docx is missing, or the file cannot be opened as DOCX.
+    """
+    # Outside the try below so a missing library is not reported as a bad file
+    _ensure_docx_imported()
+    try:
+        doc = Document(file_path)
+    except Exception as exc:  # pragma: no cover - surface invalid DOCX
+        raise ValueError(f"Invalid DOCX file: {exc}") from exc
+
+    paragraphs = [para.text for para in doc.paragraphs if para.text is not None]
+    # Join with newline to preserve paragraph breaks
+    return "\n".join(paragraphs).strip()
diff --git a/backend/app/utils/metadata.py b/backend/app/utils/metadata.py
new file mode 100644
index 0000000..1bef64c
--- /dev/null
+++ b/backend/app/utils/metadata.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import os
+from datetime import datetime
+from typing import List, Dict, Any
+
+
+def extract_metadata(file_path: str, chunks: List[str]) -> List[Dict[str, Any]]:
+    """Build one metadata dictionary per text chunk.
+
+    Every dictionary holds:
+    - filename: basename of file_path
+    - upload_date: ISO 8601 timestamp taken once per call and shared by all chunks
+    - content_summary: first 200 characters of the chunk (the whole chunk if
+      shorter, an empty string if the chunk is not a str)
+    - chunk_index: 0-based position of the chunk in chunks
+
+    Args:
+        file_path: Path to the file the chunks came from.
+        chunks: Text chunks to describe.
+
+    Returns:
+        A list of metadata dictionaries in chunk order. An empty chunks list
+        yields [] without file_path being checked.
+
+    Raises:
+        FileNotFoundError: If chunks is non-empty and file_path does not exist.
+    """
+    # Nothing to describe: return before the file is consulted at all
+    if not chunks:
+        return []
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    # Computed once so every chunk of this call carries identical values
+    shared = {
+        "filename": os.path.basename(file_path),
+        "upload_date": datetime.now().isoformat(),
+    }
+
+    def summarize(chunk: Any) -> str:
+        # Non-string chunks get an empty summary instead of raising
+        return chunk[:200] if isinstance(chunk, str) else ""
+
+    return [
+        {
+            **shared,
+            "content_summary": summarize(chunk),
+            "chunk_index": position,
+        }
+        for position, chunk in enumerate(chunks)
+    ]
diff --git a/backend/app/utils/pdf_parser.py b/backend/app/utils/pdf_parser.py
new file mode 100644
index 0000000..86908c2
--- /dev/null
+++ b/backend/app/utils/pdf_parser.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+from typing import Optional
+from pypdf import PdfReader
+
+
+def parse_pdf(file_path: str) -> str:
+    """Parse a PDF file and return its text content.
+
+    The stripped text of every page that has any is joined with newlines. A PDF
+    with no extractable text yields an empty string.
+
+    Raises:
+        ValueError: If the file cannot be opened as a PDF or extraction fails.
+    """
+    try:
+        reader = PdfReader(file_path)
+    except Exception as exc:
+        raise ValueError(f"Invalid PDF file: {exc}") from exc
+
+    try:
+        extracted = (page.extract_text() for page in reader.pages)
+        # Pages whose extracted text is empty or None are skipped
+        page_texts = [raw.strip() for raw in extracted if raw]
+    except Exception as exc:
+        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
+
+    return "\n".join(page_texts).strip()
diff --git a/backend/pytest.ini b/backend/pytest.ini
new file mode 100644
index 0000000..daf5832
--- /dev/null
+++ b/backend/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+markers =
+    acceptance: Acceptance tests with real external services (LLM, ASR, ChromaDB)
+    slow: Tests that take longer than 1 second
+asyncio_mode = auto