# legco_ai_assistant/backend/app/test/test_phase1_metadata.py
#
# 68 lines
# 2.0 KiB
# Python

import re
from pathlib import Path
from datetime import datetime
import pytest
import sys
from pathlib import Path
import importlib.util
# Load the metadata extractor directly from its source file to avoid
# package-path import issues. parents[1] is the directory one level above this
# test file's own directory; the module is expected at utils/metadata.py there.
MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "metadata.py"
spec = importlib.util.spec_from_file_location("metadata_module", str(MODULE_PATH))
# Validate the spec BEFORE using it: module_from_spec(None) would otherwise fail
# with an opaque AttributeError ahead of this check.
assert spec is not None and spec.loader is not None, (
    f"Cannot build an import spec for {MODULE_PATH}"
)
metadata_module = importlib.util.module_from_spec(spec)
# Execute the module body so its top-level definitions become attributes.
spec.loader.exec_module(metadata_module)
extract_metadata = metadata_module.extract_metadata
def _is_iso8601(s: str) -> bool:
try:
datetime.fromisoformat(s)
return True
except ValueError:
return False
def test_extract_metadata_basic(tmp_path):
    """Check the per-chunk metadata produced for an existing file.

    Uses a path containing spaces and two chunks: one of 250 characters (its
    summary is expected to be the first 200 characters) and one short chunk
    (its summary is expected to be the chunk itself).
    """
    # Create a real file on disk, inside a directory whose name has spaces.
    source_file = tmp_path / "dir with spaces" / "sample.txt"
    source_file.parent.mkdir(parents=True, exist_ok=True)
    source_file.write_text("content")

    records = extract_metadata(str(source_file), ["a" * 250, "short"])

    assert isinstance(records, list)
    assert len(records) == 2

    # Entry for the long chunk.
    first = records[0]
    assert first["filename"] == "sample.txt"
    assert first["chunk_index"] == 0
    assert first["upload_date"] is not None
    assert _is_iso8601(first["upload_date"])
    assert first["content_summary"] == "a" * 200

    # Entry for the short chunk.
    second = records[1]
    assert second["filename"] == "sample.txt"
    assert second["chunk_index"] == 1
    assert second["content_summary"] == "short"
def test_extract_metadata_empty_chunks(tmp_path):
    """An existing file with an empty chunk list yields an empty result."""
    source_file = tmp_path / "file.txt"
    source_file.write_text("data")

    assert extract_metadata(str(source_file), []) == []
def test_extract_metadata_missing_file_raises(tmp_path):
    """A path that does not exist on disk must raise FileNotFoundError."""
    # Neither the directory nor the file is ever created.
    absent_path = str(tmp_path / "nonexistent" / "nofile.txt")

    with pytest.raises(FileNotFoundError):
        extract_metadata(absent_path, ["data"])