56 lines
1.9 KiB
Python
56 lines
1.9 KiB
Python
from __future__ import annotations
|
|
|
|
import os
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any
|
|
|
|
|
|
def extract_metadata(file_path: str, chunks: List[str], original_filename: str | None = None) -> List[Dict[str, Any]]:
    """Build one metadata dictionary per text chunk.

    Each dictionary contains:

    - filename: ``original_filename`` when it is a non-empty value, otherwise
      the basename of ``file_path``
    - upload_date: ISO 8601 timestamp taken once per call, so every chunk
      produced by the same call carries the same value
    - content_summary: first 200 characters of the chunk (the whole chunk if
      it is shorter)
    - chunk_index: 0-based position of the chunk in ``chunks``

    Args:
        file_path: Path to the file associated with the chunks.
        chunks: List of string chunks to generate metadata for.
        original_filename: Override filename stored in metadata when
            ``file_path`` is a temp file.

    Returns:
        A list of metadata dictionaries, one per chunk, in input order.
        An empty ``chunks`` yields an empty list.

    Raises:
        FileNotFoundError: If ``chunks`` is non-empty and ``file_path`` does
            not exist.
    """
    # Nothing to describe. This shortcut runs before the existence check, so
    # an empty input never raises, even for a missing path.
    if not chunks:
        return []

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # An empty or None override falls back to the name derived from the path.
    filename = original_filename or os.path.basename(file_path)
    # datetime.now() is called without a tz argument, so the timestamp is
    # naive local time; it is computed once and shared by all chunks.
    upload_date = datetime.now().isoformat()

    return [
        {
            "filename": filename,
            "upload_date": upload_date,
            # Non-string entries are tolerated and summarized as "".
            "content_summary": chunk[:200] if isinstance(chunk, str) else "",
            "chunk_index": index,
        }
        for index, chunk in enumerate(chunks)
    ]
|