legco_ai_assistant/backend/app/utils/metadata.py

77 lines
2.8 KiB
Python

from __future__ import annotations
import os
from datetime import datetime
from typing import List, Dict, Any, Optional
def extract_metadata(
    file_path: str,
    chunks: List[str],
    original_filename: str | None = None,
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
) -> List[Dict[str, Any]]:
    """Build one metadata record per text chunk.

    Every record carries the same seven keys, in this order:
    ``filename``, ``upload_date``, ``content_summary``, ``chunk_index``,
    ``page_number``, ``chunk_file_path`` and ``document_id``. The last
    three are ``None`` when the corresponding argument is not supplied.

    Args:
        file_path: Path that must exist on disk; its basename is used as
            the stored filename unless ``original_filename`` is given.
        chunks: Text chunks to describe. A chunk that is not a ``str``
            is summarised as the empty string.
        original_filename: Name to store instead of the basename of
            ``file_path``. A falsy value (``None`` or ``""``) means
            "use the basename".
        page_numbers: Per-chunk page numbers, indexed in step with
            ``chunks``.
        chunk_file_paths: Per-chunk source file paths, indexed in step
            with ``chunks``.
        document_id: Identifier copied unchanged into every record.

    Returns:
        A list with one dictionary per chunk, or ``[]`` when ``chunks``
        is empty (in which case no other argument is inspected).

    Raises:
        FileNotFoundError: If ``chunks`` is non-empty and ``file_path``
            does not exist.
        ValueError: If ``page_numbers`` or ``chunk_file_paths`` is given
            with a length different from ``len(chunks)``.
    """
    # Nothing to describe: return before touching the filesystem.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # The per-chunk sequences are indexed by chunk position below, so
    # their lengths have to agree with the number of chunks.
    if page_numbers is not None:
        if len(page_numbers) != len(chunks):
            raise ValueError(
                f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})"
            )
    if chunk_file_paths is not None:
        if len(chunk_file_paths) != len(chunks):
            raise ValueError(
                f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
            )

    stored_name = original_filename or os.path.basename(file_path)
    # Taken once so that all records of this call share one timestamp.
    stamp = datetime.now().isoformat()

    return [
        {
            "filename": stored_name,
            "upload_date": stamp,
            # Summary is the first 200 characters of the chunk text.
            "content_summary": (piece if isinstance(piece, str) else "")[:200],
            "chunk_index": position,
            "page_number": page_numbers[position] if page_numbers else None,
            "chunk_file_path": (
                chunk_file_paths[position] if chunk_file_paths else None
            ),
            "document_id": document_id,
        }
        for position, piece in enumerate(chunks)
    ]