81 lines
3.0 KiB
Python
81 lines
3.0 KiB
Python
from __future__ import annotations
|
|
|
|
import os
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
|
|
def extract_metadata(
    file_path: str,
    chunks: List[str],
    original_filename: str | None = None,
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk.

    Keys that are always present in every entry:
        - filename: ``original_filename`` if truthy, else the basename of
          ``file_path``.
        - upload_date: ISO-8601 string from ``datetime.now()`` (naive local
          time); computed once, so every chunk of a call shares the value.
        - content_summary: the first 200 characters of the chunk, or ``""``
          when the chunk is not a ``str``.
        - chunk_index: zero-based position of the chunk in ``chunks``.
        - document_id: the ``document_id`` argument, ``None`` if not given.

    Keys that are present only when a non-None value exists for the chunk
    (the key is omitted entirely otherwise, it is NOT stored as ``None``):
        - page_number: taken from ``page_numbers[idx]``.
        - chunk_file_path: taken from ``chunk_file_paths[idx]``.

    Args:
        file_path: Path to the file associated with the chunks.
        chunks: List of string chunks to generate metadata for.
        original_filename: Override filename stored in metadata when
            file_path is a temp file.
        page_numbers: Optional per-chunk page numbers. Length must match
            chunks; individual items may be None.
        chunk_file_paths: Optional per-chunk source file paths. Length must
            match chunks; individual items may be None.
        document_id: Optional document identifier applied to all chunks.

    Returns:
        A list of metadata dictionaries, one per chunk, in chunk order.
        An empty list if chunks is empty; in that case no other argument
        is validated (a missing file does not raise).

    Raises:
        FileNotFoundError: If chunks is non-empty and file_path does not
            exist.
        ValueError: If page_numbers or chunk_file_paths is given and its
            length differs from the length of chunks.
    """
    # Nothing to describe: return before any validation, as callers may pass
    # an empty chunk list together with a path that was already cleaned up.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if page_numbers is not None and len(page_numbers) != len(chunks):
        raise ValueError(
            f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})"
        )

    if chunk_file_paths is not None and len(chunk_file_paths) != len(chunks):
        raise ValueError(
            f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
        )

    # An empty-string override falls back to the basename, same as None.
    filename = original_filename or os.path.basename(file_path)
    # Taken once so all chunks of one document carry an identical timestamp.
    upload_date = datetime.now().isoformat()

    metadata: List[Dict[str, Any]] = []
    for idx, chunk in enumerate(chunks):
        # Non-string chunks are tolerated and summarised as empty text.
        text = chunk if isinstance(chunk, str) else ""
        entry: Dict[str, Any] = {
            "filename": filename,
            "upload_date": upload_date,
            "content_summary": text[:200],
            "chunk_index": idx,
            "document_id": document_id,
        }

        # Lengths were validated above, so indexing by idx is safe here.
        page_num = page_numbers[idx] if page_numbers is not None else None
        if page_num is not None:
            entry["page_number"] = page_num

        cfp = chunk_file_paths[idx] if chunk_file_paths is not None else None
        if cfp is not None:
            entry["chunk_file_path"] = cfp

        metadata.append(entry)

    return metadata
|