feat(backend): add page_number, chunk_file_path, document_id to chunk metadata
Enhance extract_metadata() with three new optional fields for page-aware chunking support. Validates list length mismatches. Fully backward compatible — existing callers unaffected. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
0995c685fa
commit
b97264c66a
|
|
@ -0,0 +1,188 @@
|
||||||
|
"""Phase 1.5.4: Enhanced metadata tests.
|
||||||
|
|
||||||
|
Tests for extract_metadata() with new page_number, chunk_file_path,
|
||||||
|
and document_id fields. These fields are optional and maintain backward
|
||||||
|
compatibility with existing callers.
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import importlib.util
|
||||||
|
|
||||||
|
|
||||||
|
# Dynamically load the metadata module (same pattern as test_phase1_metadata.py).
|
||||||
|
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "metadata.py"
|
||||||
|
spec = importlib.util.spec_from_file_location("metadata_module", str(MODULE_PATH))
|
||||||
|
assert spec is not None and spec.loader is not None
|
||||||
|
metadata_module = importlib.util.module_from_spec(spec) # type: ignore
|
||||||
|
spec.loader.exec_module(metadata_module) # type: ignore
|
||||||
|
extract_metadata = getattr(metadata_module, "extract_metadata")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_iso8601(s: str) -> bool:
|
||||||
|
try:
|
||||||
|
datetime.fromisoformat(s)
|
||||||
|
return True
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# ── Test cases ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_metadata_with_page_numbers(tmp_path):
|
||||||
|
"""Each chunk should receive the corresponding page_number."""
|
||||||
|
dummy_file = tmp_path / "doc.pdf"
|
||||||
|
dummy_file.write_text("pdf content")
|
||||||
|
|
||||||
|
chunks = ["chunk A", "chunk B", "chunk C"]
|
||||||
|
page_numbers = [1, 2, 3]
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
str(dummy_file), chunks, page_numbers=page_numbers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(metadata) == 3
|
||||||
|
for i, m in enumerate(metadata):
|
||||||
|
assert m["page_number"] == page_numbers[i]
|
||||||
|
# Existing fields must still be present
|
||||||
|
assert "filename" in m
|
||||||
|
assert "chunk_index" in m
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_metadata_with_chunk_file_paths(tmp_path):
|
||||||
|
"""Each chunk should receive the corresponding chunk_file_path."""
|
||||||
|
dummy_file = tmp_path / "report.pdf"
|
||||||
|
dummy_file.write_text("report data")
|
||||||
|
|
||||||
|
chunks = ["page-1 text", "page-2 text"]
|
||||||
|
chunk_file_paths = ["report_page_1.pdf", "report_page_2.pdf"]
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
str(dummy_file), chunks, chunk_file_paths=chunk_file_paths
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(metadata) == 2
|
||||||
|
for i, m in enumerate(metadata):
|
||||||
|
assert m["chunk_file_path"] == chunk_file_paths[i]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_metadata_with_document_id(tmp_path):
|
||||||
|
"""All metadata dicts should contain the same document_id."""
|
||||||
|
dummy_file = tmp_path / "memo.docx"
|
||||||
|
dummy_file.write_text("memo")
|
||||||
|
|
||||||
|
chunks = ["section A", "section B"]
|
||||||
|
document_id = "test-uuid-123"
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
str(dummy_file), chunks, document_id=document_id
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(metadata) == 2
|
||||||
|
for m in metadata:
|
||||||
|
assert m["document_id"] == "test-uuid-123"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_metadata_all_new_fields(tmp_path):
|
||||||
|
"""Pass all three new params together and verify complete structure."""
|
||||||
|
dummy_file = tmp_path / "full.pdf"
|
||||||
|
dummy_file.write_text("full doc")
|
||||||
|
|
||||||
|
chunks = ["alpha", "beta", "gamma"]
|
||||||
|
page_numbers = [5, 6, 7]
|
||||||
|
chunk_file_paths = ["full_p5.txt", "full_p6.txt", "full_p7.txt"]
|
||||||
|
document_id = "uuid-all-fields"
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
str(dummy_file),
|
||||||
|
chunks,
|
||||||
|
page_numbers=page_numbers,
|
||||||
|
chunk_file_paths=chunk_file_paths,
|
||||||
|
document_id=document_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(metadata) == 3
|
||||||
|
|
||||||
|
for i, m in enumerate(metadata):
|
||||||
|
# New fields
|
||||||
|
assert m["page_number"] == page_numbers[i]
|
||||||
|
assert m["chunk_file_path"] == chunk_file_paths[i]
|
||||||
|
assert m["document_id"] == document_id
|
||||||
|
|
||||||
|
# Old fields must still be present and correct
|
||||||
|
assert m["filename"] == "full.pdf"
|
||||||
|
assert m["chunk_index"] == i
|
||||||
|
assert _is_iso8601(m["upload_date"])
|
||||||
|
assert m["content_summary"] == chunks[i]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_metadata_without_new_fields(tmp_path):
|
||||||
|
"""Calling without any new params must be backward-compatible.
|
||||||
|
|
||||||
|
The new fields (page_number, chunk_file_path, document_id) should not
|
||||||
|
appear in the metadata dicts when not explicitly requested.
|
||||||
|
"""
|
||||||
|
dummy_file = tmp_path / "legacy.txt"
|
||||||
|
dummy_file.write_text("legacy")
|
||||||
|
|
||||||
|
chunks = ["old-style chunk"]
|
||||||
|
|
||||||
|
metadata = extract_metadata(str(dummy_file), chunks)
|
||||||
|
|
||||||
|
assert len(metadata) == 1
|
||||||
|
m = metadata[0]
|
||||||
|
|
||||||
|
# New fields should be absent (or None — implementation choice).
|
||||||
|
# Following the plan: these fields are optional additions.
|
||||||
|
assert m.get("page_number") is None
|
||||||
|
assert m.get("chunk_file_path") is None
|
||||||
|
assert m.get("document_id") is None
|
||||||
|
|
||||||
|
# Core fields intact
|
||||||
|
assert m["filename"] == "legacy.txt"
|
||||||
|
assert m["chunk_index"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_metadata_page_numbers_length_mismatch(tmp_path):
|
||||||
|
"""If page_numbers length != chunks length, should raise ValueError."""
|
||||||
|
dummy_file = tmp_path / "mismatch.pdf"
|
||||||
|
dummy_file.write_text("data")
|
||||||
|
|
||||||
|
chunks = ["a", "b", "c"]
|
||||||
|
page_numbers = [1, 2] # Only 2 for 3 chunks
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="page_numbers"):
|
||||||
|
extract_metadata(str(dummy_file), chunks, page_numbers=page_numbers)
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_metadata_chunk_file_paths_length_mismatch(tmp_path):
|
||||||
|
"""If chunk_file_paths length != chunks length, should raise ValueError."""
|
||||||
|
dummy_file = tmp_path / "mismatch2.pdf"
|
||||||
|
dummy_file.write_text("data")
|
||||||
|
|
||||||
|
chunks = ["x", "y"]
|
||||||
|
chunk_file_paths = ["only_one.pdf"] # 1 for 2 chunks
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="chunk_file_paths"):
|
||||||
|
extract_metadata(
|
||||||
|
str(dummy_file), chunks, chunk_file_paths=chunk_file_paths
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_metadata_page_numbers_none_in_list(tmp_path):
|
||||||
|
"""page_numbers can contain None for chunks without page info (e.g. DOCX)."""
|
||||||
|
dummy_file = tmp_path / "mixed.docx"
|
||||||
|
dummy_file.write_text("docx content")
|
||||||
|
|
||||||
|
chunks = ["cover page", "body text"]
|
||||||
|
page_numbers = [None, 1]
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
str(dummy_file), chunks, page_numbers=page_numbers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(metadata) == 2
|
||||||
|
assert metadata[0]["page_number"] is None
|
||||||
|
assert metadata[1]["page_number"] == 1
|
||||||
|
|
@ -2,54 +2,75 @@ from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
||||||
|
|
||||||
def extract_metadata(file_path: str, chunks: List[str], original_filename: str | None = None) -> List[Dict[str, Any]]:
|
def extract_metadata(
|
||||||
|
file_path: str,
|
||||||
|
chunks: List[str],
|
||||||
|
original_filename: str | None = None,
|
||||||
|
page_numbers: List[int | None] | None = None,
|
||||||
|
chunk_file_paths: List[str | None] | None = None,
|
||||||
|
document_id: str | None = None,
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
"""Extract metadata for a list of text chunks.
|
"""Extract metadata for a list of text chunks.
|
||||||
|
|
||||||
For each chunk, create a metadata dictionary containing:
|
Core fields (always present):
|
||||||
- filename: basename of the provided file_path (or original_filename if provided)
|
- filename, upload_date, content_summary, chunk_index
|
||||||
- upload_date: ISO 8601 timestamp of when metadata was generated
|
|
||||||
- content_summary: first 200 characters of the chunk (or full chunk if shorter)
|
Optional fields (present when provided, None otherwise):
|
||||||
- chunk_index: 0-based index of the chunk
|
- page_number: page source for the chunk (int or None for non-paginated docs)
|
||||||
|
- chunk_file_path: path to the per-chunk source file
|
||||||
|
- document_id: unique identifier linking all chunks to the same document
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the file associated with the chunks.
|
file_path: Path to the file associated with the chunks.
|
||||||
chunks: List of string chunks to generate metadata for.
|
chunks: List of string chunks to generate metadata for.
|
||||||
original_filename: Override filename stored in metadata when file_path
|
original_filename: Override filename stored in metadata when file_path
|
||||||
is a temp file.
|
is a temp file.
|
||||||
|
page_numbers: Optional per-chunk page numbers. Length must match chunks.
|
||||||
|
chunk_file_paths: Optional per-chunk source file paths. Length must match chunks.
|
||||||
|
document_id: Optional unique document identifier applied to all chunks.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of metadata dictionaries, one per chunk. If chunks is empty, returns an empty list.
|
A list of metadata dictionaries, one per chunk. Empty list if chunks is empty.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
FileNotFoundError: If the provided file_path does not exist.
|
FileNotFoundError: If file_path does not exist.
|
||||||
|
ValueError: If page_numbers or chunk_file_paths length mismatches chunks.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Edge case: no chunks to metadataize
|
|
||||||
if not chunks:
|
if not chunks:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Validate file existence up-front to follow the edge-case requirements
|
|
||||||
if not os.path.exists(file_path):
|
if not os.path.exists(file_path):
|
||||||
raise FileNotFoundError(f"File not found: {file_path}")
|
raise FileNotFoundError(f"File not found: {file_path}")
|
||||||
|
|
||||||
|
if page_numbers is not None and len(page_numbers) != len(chunks):
|
||||||
|
raise ValueError(
|
||||||
|
f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})"
|
||||||
|
)
|
||||||
|
|
||||||
|
if chunk_file_paths is not None and len(chunk_file_paths) != len(chunks):
|
||||||
|
raise ValueError(
|
||||||
|
f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
|
||||||
|
)
|
||||||
|
|
||||||
filename = original_filename if original_filename else os.path.basename(file_path)
|
filename = original_filename if original_filename else os.path.basename(file_path)
|
||||||
upload_date = datetime.now().isoformat()
|
upload_date = datetime.now().isoformat()
|
||||||
|
|
||||||
metadata: List[Dict[str, Any]] = []
|
metadata: List[Dict[str, Any]] = []
|
||||||
for idx, chunk in enumerate(chunks):
|
for idx, chunk in enumerate(chunks):
|
||||||
# Ensure we always have a string for summary extraction
|
|
||||||
text = chunk if isinstance(chunk, str) else ""
|
text = chunk if isinstance(chunk, str) else ""
|
||||||
content_summary = text[:200]
|
content_summary = text[:200]
|
||||||
metadata.append(
|
entry: Dict[str, Any] = {
|
||||||
{
|
"filename": filename,
|
||||||
"filename": filename,
|
"upload_date": upload_date,
|
||||||
"upload_date": upload_date,
|
"content_summary": content_summary,
|
||||||
"content_summary": content_summary,
|
"chunk_index": idx,
|
||||||
"chunk_index": idx,
|
"page_number": page_numbers[idx] if page_numbers else None,
|
||||||
}
|
"chunk_file_path": chunk_file_paths[idx] if chunk_file_paths else None,
|
||||||
)
|
"document_id": document_id,
|
||||||
|
}
|
||||||
|
metadata.append(entry)
|
||||||
|
|
||||||
return metadata
|
return metadata
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue