"""Tests for ``extract_metadata`` loaded directly from ``utils/metadata.py``.

The module under test is loaded by file path rather than through a package
import, so these tests do not depend on how the package is installed or on
the current ``sys.path``.
"""

import importlib.util
import re
import sys
from datetime import datetime
from pathlib import Path

import pytest

# Dynamically load the metadata extractor to avoid package-path import issues.
# The module lives at backend/app/utils/metadata.py relative to this test file:
# ``parents[1]`` is the directory one level above this file's own directory.
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "metadata.py"

spec = importlib.util.spec_from_file_location("metadata_module", str(MODULE_PATH))
# Validate the spec *before* it is handed to ``module_from_spec``; checking
# afterwards would be too late to produce a useful error.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot create an import spec for {MODULE_PATH}")

metadata_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(metadata_module)
extract_metadata = metadata_module.extract_metadata


def _is_iso8601(s: str) -> bool:
    """Return True if *s* is accepted by ``datetime.fromisoformat``.

    Non-string values yield False instead of raising, so a wrongly typed
    value makes the calling assertion fail rather than error out.
    """
    try:
        datetime.fromisoformat(s)
    except (TypeError, ValueError):
        return False
    return True


def test_extract_metadata_basic(tmp_path):
    """Each chunk gets one metadata entry with the expected fields."""
    # Prepare a dummy file path that exists (the directory name contains
    # spaces on purpose).
    dummy_file = tmp_path / "dir with spaces" / "sample.txt"
    dummy_file.parent.mkdir(parents=True, exist_ok=True)
    dummy_file.write_text("content")

    chunks = ["a" * 250, "short"]
    metadata = extract_metadata(str(dummy_file), chunks)

    assert isinstance(metadata, list)
    assert len(metadata) == 2

    # First chunk: 250 characters in, the summary is expected to be the
    # first 200 of them.
    m0 = metadata[0]
    assert m0["filename"] == "sample.txt"
    assert m0["chunk_index"] == 0
    assert m0["upload_date"] is not None
    assert _is_iso8601(m0["upload_date"])
    assert m0["content_summary"] == "a" * 200

    # Second chunk: short enough that the summary equals the chunk itself.
    m1 = metadata[1]
    assert m1["filename"] == "sample.txt"
    assert m1["chunk_index"] == 1
    assert m1["content_summary"] == "short"


def test_extract_metadata_empty_chunks(tmp_path):
    """An empty chunk list produces an empty metadata list."""
    dummy_file = tmp_path / "file.txt"
    dummy_file.write_text("data")

    metadata = extract_metadata(str(dummy_file), [])

    assert metadata == []


def test_extract_metadata_missing_file_raises(tmp_path):
    """A path that does not exist is expected to raise FileNotFoundError."""
    missing = tmp_path / "nonexistent" / "nofile.txt"

    with pytest.raises(FileNotFoundError):
        extract_metadata(str(missing), ["data"])