legco_ai_assistant/backend/app/utils/metadata.py

101 lines
4.0 KiB
Python

from __future__ import annotations
import os
from datetime import datetime
from typing import List, Dict, Any, Optional
def extract_metadata(
    file_path: str,
    chunks: List[str],
    original_filename: str | None = None,
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
    strategy_type: str = "token",
    chunk_metadata: List[Dict[str, Any]] | None = None,
) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk.

    Core fields (always present in every entry):
        - filename, upload_date, content_summary, chunk_index
        - document_id: identifier linking all chunks to the same document
          (``None`` when not provided)
        - strategy_type: the chunking strategy label passed in

    Optional fields (key is present only when a non-None value was provided
    for that chunk; otherwise the key is omitted):
        - page_number: page source for the chunk
        - chunk_file_path: path to the per-chunk source file

    Package 8 Q&A fields (present when supplied through chunk_metadata):
        - strategy_type, section_type, question_index, question_id,
          question_text, section_heading, answer_contains_table,
          source_page_range, parent_topic

    Args:
        file_path: Path to the file associated with the chunks. Must exist
            whenever ``chunks`` is non-empty.
        chunks: List of string chunks to generate metadata for. Non-string
            items yield an empty ``content_summary``.
        original_filename: Override filename stored in metadata when
            file_path is a temp file. Falls back to the basename of
            file_path when empty or None.
        page_numbers: Optional per-chunk page numbers. Length must match
            chunks. Individual entries may be None.
        chunk_file_paths: Optional per-chunk source file paths. Length must
            match chunks. Individual entries may be None.
        document_id: Optional unique document identifier applied to all
            chunks.
        strategy_type: Chunking strategy used ("token" or "question").
            Stored in each chunk's metadata.
        chunk_metadata: Optional per-chunk metadata dicts from the Q&A
            strategy. Each dict is merged into the corresponding base entry
            *after* the base fields are set, so its keys override base keys
            of the same name. A non-empty list must match chunks in length;
            an empty list is treated the same as None. Individual entries
            may be None (or empty), in which case nothing is merged for
            that chunk. Fields like question_id, question_index,
            section_type, etc. are carried through so they can be forwarded
            to ChromaDB metadata.

    Returns:
        A list of metadata dictionaries, one per chunk, in chunk order.
        Empty list if chunks is empty (no other validation is performed in
        that case).

    Raises:
        FileNotFoundError: If file_path does not exist.
        ValueError: If page_numbers, chunk_file_paths, or a non-empty
            chunk_metadata has a length that differs from chunks.
    """
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if page_numbers is not None and len(page_numbers) != len(chunks):
        raise ValueError(
            f"page_numbers length ({len(page_numbers)}) does not match chunks length ({len(chunks)})"
        )
    if chunk_file_paths is not None and len(chunk_file_paths) != len(chunks):
        raise ValueError(
            f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
        )
    # An empty chunk_metadata list is deliberately accepted and ignored.
    if chunk_metadata and len(chunk_metadata) != len(chunks):
        raise ValueError(
            f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({len(chunks)})"
        )

    filename = original_filename if original_filename else os.path.basename(file_path)
    # Computed once so every chunk of the document shares the same timestamp.
    # NOTE: datetime.now() yields a naive local-time value (no UTC offset).
    upload_date = datetime.now().isoformat()

    metadata: List[Dict[str, Any]] = []
    for idx, chunk in enumerate(chunks):
        # Non-string chunks are summarised as an empty string, not rejected.
        text = chunk if isinstance(chunk, str) else ""

        entry: Dict[str, Any] = {
            "filename": filename,
            "upload_date": upload_date,
            "content_summary": text[:200],
            "chunk_index": idx,
            "document_id": document_id,
            "strategy_type": strategy_type,
        }

        # Per-chunk optional fields are added only when they carry a value.
        page_num = page_numbers[idx] if page_numbers else None
        if page_num is not None:
            entry["page_number"] = page_num

        cfp = chunk_file_paths[idx] if chunk_file_paths else None
        if cfp is not None:
            entry["chunk_file_path"] = cfp

        # Merge strategy-specific fields last so they take precedence.
        # A None/empty entry means "no extra fields for this chunk"; it is
        # skipped instead of crashing in dict.update().
        extra = chunk_metadata[idx] if chunk_metadata else None
        if extra:
            entry.update(extra)

        metadata.append(entry)

    return metadata