From b97264c66a38e655b0662f30ca829cbf189313ae Mon Sep 17 00:00:00 2001 From: Woody Date: Fri, 24 Apr 2026 10:30:40 +0800 Subject: [PATCH] feat(backend): add page_number, chunk_file_path, document_id to chunk metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance extract_metadata() with three new optional fields for page-aware chunking support. Validates list length mismatches. Fully backward compatible — existing callers unaffected. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- .../app/test/test_phase1_enhanced_metadata.py | 188 ++++++++++++++++++ backend/app/utils/metadata.py | 63 ++++-- 2 files changed, 230 insertions(+), 21 deletions(-) create mode 100644 backend/app/test/test_phase1_enhanced_metadata.py diff --git a/backend/app/test/test_phase1_enhanced_metadata.py b/backend/app/test/test_phase1_enhanced_metadata.py new file mode 100644 index 0000000..a20567b --- /dev/null +++ b/backend/app/test/test_phase1_enhanced_metadata.py @@ -0,0 +1,188 @@ +"""Phase 1.5.4: Enhanced metadata tests. + +Tests for extract_metadata() with new page_number, chunk_file_path, +and document_id fields. These fields are optional and maintain backward +compatibility with existing callers. +""" +from pathlib import Path +from datetime import datetime + +import pytest +import importlib.util + + +# Dynamically load the metadata module (same pattern as test_phase1_metadata.py). +MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "metadata.py" +spec = importlib.util.spec_from_file_location("metadata_module", str(MODULE_PATH)) +assert spec is not None and spec.loader is not None +metadata_module = importlib.util.module_from_spec(spec) # type: ignore +spec.loader.exec_module(metadata_module) # type: ignore +extract_metadata = getattr(metadata_module, "extract_metadata") + + +def _is_iso8601(s: str) -> bool: + try: + datetime.fromisoformat(s) + return True + except ValueError: + return False + + +# ── Test cases ────────────────────────────────────────────────────────── + + +def test_extract_metadata_with_page_numbers(tmp_path): + """Each chunk should receive the corresponding page_number.""" + dummy_file = tmp_path / "doc.pdf" + dummy_file.write_text("pdf content") + + chunks = ["chunk A", "chunk B", "chunk C"] + page_numbers = [1, 2, 3] + + metadata = extract_metadata( + str(dummy_file), chunks, page_numbers=page_numbers + ) + + assert len(metadata) == 3 + for i, m in enumerate(metadata): + assert m["page_number"] == page_numbers[i] + # Existing fields must still be present + assert "filename" in m + assert "chunk_index" in m + + +def test_extract_metadata_with_chunk_file_paths(tmp_path): + """Each chunk should receive the corresponding chunk_file_path.""" + dummy_file = tmp_path / "report.pdf" + dummy_file.write_text("report data") + + chunks = ["page-1 text", "page-2 text"] + chunk_file_paths = ["report_page_1.pdf", "report_page_2.pdf"] + + metadata = extract_metadata( + str(dummy_file), chunks, chunk_file_paths=chunk_file_paths + ) + + assert len(metadata) == 2 + for i, m in enumerate(metadata): + assert m["chunk_file_path"] == chunk_file_paths[i] + + +def test_extract_metadata_with_document_id(tmp_path): + """All metadata dicts should contain the same document_id.""" + dummy_file = tmp_path / "memo.docx" + dummy_file.write_text("memo") + + chunks = ["section A", "section B"] + document_id = "test-uuid-123" + + metadata = extract_metadata( + str(dummy_file), chunks, document_id=document_id + ) + + assert len(metadata) == 2 + for m in metadata: + assert m["document_id"] == "test-uuid-123" + + +def test_extract_metadata_all_new_fields(tmp_path): + """Pass all three new params together and verify complete structure.""" + dummy_file = tmp_path / "full.pdf" + dummy_file.write_text("full doc") + + chunks = ["alpha", "beta", "gamma"] + page_numbers = [5, 6, 7] + chunk_file_paths = ["full_p5.txt", "full_p6.txt", "full_p7.txt"] + document_id = "uuid-all-fields" + + metadata = extract_metadata( + str(dummy_file), + chunks, + page_numbers=page_numbers, + chunk_file_paths=chunk_file_paths, + document_id=document_id, + ) + + assert len(metadata) == 3 + + for i, m in enumerate(metadata): + # New fields + assert m["page_number"] == page_numbers[i] + assert m["chunk_file_path"] == chunk_file_paths[i] + assert m["document_id"] == document_id + + # Old fields must still be present and correct + assert m["filename"] == "full.pdf" + assert m["chunk_index"] == i + assert _is_iso8601(m["upload_date"]) + assert m["content_summary"] == chunks[i] + + +def test_extract_metadata_without_new_fields(tmp_path): + """Calling without any new params must be backward-compatible. + + The new fields (page_number, chunk_file_path, document_id) should not + appear in the metadata dicts when not explicitly requested. + """ + dummy_file = tmp_path / "legacy.txt" + dummy_file.write_text("legacy") + + chunks = ["old-style chunk"] + + metadata = extract_metadata(str(dummy_file), chunks) + + assert len(metadata) == 1 + m = metadata[0] + + # New fields should be absent (or None — implementation choice). + # Following the plan: these fields are optional additions. + assert m.get("page_number") is None + assert m.get("chunk_file_path") is None + assert m.get("document_id") is None + + # Core fields intact + assert m["filename"] == "legacy.txt" + assert m["chunk_index"] == 0 + + +def test_extract_metadata_page_numbers_length_mismatch(tmp_path): + """If page_numbers length != chunks length, should raise ValueError.""" + dummy_file = tmp_path / "mismatch.pdf" + dummy_file.write_text("data") + + chunks = ["a", "b", "c"] + page_numbers = [1, 2] # Only 2 for 3 chunks + + with pytest.raises(ValueError, match="page_numbers"): + extract_metadata(str(dummy_file), chunks, page_numbers=page_numbers) + + +def test_extract_metadata_chunk_file_paths_length_mismatch(tmp_path): + """If chunk_file_paths length != chunks length, should raise ValueError.""" + dummy_file = tmp_path / "mismatch2.pdf" + dummy_file.write_text("data") + + chunks = ["x", "y"] + chunk_file_paths = ["only_one.pdf"] # 1 for 2 chunks + + with pytest.raises(ValueError, match="chunk_file_paths"): + extract_metadata( + str(dummy_file), chunks, chunk_file_paths=chunk_file_paths + ) + + +def test_extract_metadata_page_numbers_none_in_list(tmp_path): + """page_numbers can contain None for chunks without page info (e.g. DOCX).""" + dummy_file = tmp_path / "mixed.docx" + dummy_file.write_text("docx content") + + chunks = ["cover page", "body text"] + page_numbers = [None, 1] + + metadata = extract_metadata( + str(dummy_file), chunks, page_numbers=page_numbers + ) + + assert len(metadata) == 2 + assert metadata[0]["page_number"] is None + assert metadata[1]["page_number"] == 1 diff --git a/backend/app/utils/metadata.py b/backend/app/utils/metadata.py index 4eaabb0..2b94cce 100644 --- a/backend/app/utils/metadata.py +++ b/backend/app/utils/metadata.py @@ -2,54 +2,75 @@ from __future__ import annotations import os from datetime import datetime -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional -def extract_metadata(file_path: str, chunks: List[str], original_filename: str | None = None) -> List[Dict[str, Any]]: +def extract_metadata( + file_path: str, + chunks: List[str], + original_filename: str | None = None, + page_numbers: List[int | None] | None = None, + chunk_file_paths: List[str | None] | None = None, + document_id: str | None = None, +) -> List[Dict[str, Any]]: """Extract metadata for a list of text chunks. - For each chunk, create a metadata dictionary containing: - - filename: basename of the provided file_path (or original_filename if provided) - - upload_date: ISO 8601 timestamp of when metadata was generated - - content_summary: first 200 characters of the chunk (or full chunk if shorter) - - chunk_index: 0-based index of the chunk + Core fields (always present): + - filename, upload_date, content_summary, chunk_index + + Optional fields (present when provided, None otherwise): + - page_number: page source for the chunk (int or None for non-paginated docs) + - chunk_file_path: path to the per-chunk source file + - document_id: unique identifier linking all chunks to the same document Args: file_path: Path to the file associated with the chunks. chunks: List of string chunks to generate metadata for. original_filename: Override filename stored in metadata when file_path is a temp file. + page_numbers: Optional per-chunk page numbers. Length must match chunks. + chunk_file_paths: Optional per-chunk source file paths. Length must match chunks. + document_id: Optional unique document identifier applied to all chunks. Returns: - A list of metadata dictionaries, one per chunk. If chunks is empty, returns an empty list. + A list of metadata dictionaries, one per chunk. Empty list if chunks is empty. Raises: - FileNotFoundError: If the provided file_path does not exist. + FileNotFoundError: If file_path does not exist. + ValueError: If page_numbers or chunk_file_paths length mismatches chunks. """ - - # Edge case: no chunks to metadataize if not chunks: return [] - # Validate file existence up-front to follow the edge-case requirements if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") + if page_numbers is not None and len(page_numbers) != len(chunks): + raise ValueError( + f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})" + ) + + if chunk_file_paths is not None and len(chunk_file_paths) != len(chunks): + raise ValueError( + f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})" + ) + filename = original_filename if original_filename else os.path.basename(file_path) upload_date = datetime.now().isoformat() metadata: List[Dict[str, Any]] = [] for idx, chunk in enumerate(chunks): - # Ensure we always have a string for summary extraction text = chunk if isinstance(chunk, str) else "" content_summary = text[:200] - metadata.append( - { - "filename": filename, - "upload_date": upload_date, - "content_summary": content_summary, - "chunk_index": idx, - } - ) + entry: Dict[str, Any] = { + "filename": filename, + "upload_date": upload_date, + "content_summary": content_summary, + "chunk_index": idx, + "page_number": page_numbers[idx] if page_numbers else None, + "chunk_file_path": chunk_file_paths[idx] if chunk_file_paths else None, + "document_id": document_id, + } + metadata.append(entry) return metadata