diff --git a/.gitignore b/.gitignore index 58230fb..b3995a1 100644 --- a/.gitignore +++ b/.gitignore @@ -76,6 +76,9 @@ htmlcov/ # Docker .dockerignore +# Package 3 — SQLite databases +data/ + # Misc *.bak *.tmp diff --git a/backend/.env.example b/backend/.env.example index 42558be..72c025b 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -14,4 +14,8 @@ CHUNK_OVERLAP=200 RETRIEVAL_N_RESULTS=10 RELEVANCE_THRESHOLD=7.0 +# SQLite databases (Package 3) +PROMPTS_DB_PATH=./data/prompts.db +HISTORY_DB_PATH=./data/history.db + CORS_ORIGINS=["http://localhost:5173","http://localhost:3000"] diff --git a/backend/app/core/config.py b/backend/app/core/config.py index e4fc90a..bab6b88 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -22,6 +22,10 @@ class Settings(BaseSettings): # Chunk PDF storage (extracted PDF pages) document_chunk_path: str = "./document_chunk" + # SQLite databases (Package 3) + prompts_db_path: str = "./data/prompts.db" + history_db_path: str = "./data/history.db" + # App configuration moved to settings for easier testing/configuration # Cross-origin settings and chunking parameters (Phase 1 plan) cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"] diff --git a/backend/app/core/sqlite_db.py b/backend/app/core/sqlite_db.py new file mode 100644 index 0000000..322c605 --- /dev/null +++ b/backend/app/core/sqlite_db.py @@ -0,0 +1,145 @@ +import logging +import os +import sqlite3 +from pathlib import Path + +from app.core.config import get_settings + +logger = logging.getLogger(__name__) + +# ── Seed prompt templates (from current hardcoded service prompts) ────────── + +_SEED_DECOMPOSE = ( + "Given this question: '{question}'\n\n" + "Break it down into 2-5 simplified sub-questions that would help " + "search for relevant information. Each sub-question should be short " + "and focused on one aspect. Return as a JSON array of strings." 
def _get_db(db_path: str) -> sqlite3.Connection:
    """Open a SQLite connection at ``db_path`` with the project's standard settings.

    The parent directory is created when the path has one. The connection
    uses ``sqlite3.Row`` as its row factory, switches the journal to WAL and
    turns on foreign-key enforcement (which SQLite leaves off by default).

    Args:
        db_path: Filesystem path of the database file. May be a bare filename
            (no directory component), in which case no directory is created.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for closing it.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the database cannot be opened or configured.
    """
    parent_dir = os.path.dirname(db_path)
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA foreign_keys=ON")
    except sqlite3.Error:
        # Do not hand back (or leak) a half-configured connection.
        conn.close()
        raise
    return conn
def seed_default_profiles(conn: sqlite3.Connection) -> None:
    """Insert the default prompt profiles and their per-step templates.

    For every ``(name, is_active)`` pair in ``_SEED_PROFILES`` a profile row
    is inserted with ``INSERT OR IGNORE``. Profiles that already exist are
    left untouched, including their prompts, so calling this repeatedly does
    not duplicate or overwrite rows. Each newly created profile receives one
    prompt row per entry in ``_SEED_STEPS``, taken from ``_SEED_TEMPLATES``.

    Args:
        conn: Open connection to the prompts database; the tables created by
            ``init_prompts_db`` must already exist.

    Raises:
        sqlite3.Error: If any statement fails. Nothing from this call is
            persisted in that case.
    """
    # The "profile already exists -> skip its prompts" shortcut below is only
    # safe if a profile and its prompts are always persisted together. The
    # connection context manager commits on success and rolls back on any
    # exception, so a profile can never be stored without its prompts.
    with conn:
        for profile_name, is_active in _SEED_PROFILES:
            cursor = conn.execute(
                "INSERT OR IGNORE INTO system_prompt_profiles (name, is_active) VALUES (?, ?)",
                (profile_name, is_active),
            )
            if cursor.rowcount == 0:
                # Profile was seeded on an earlier run; keep whatever it holds now.
                continue

            # rowcount == 1 means the INSERT happened, so lastrowid is the new id.
            profile_id = cursor.lastrowid
            conn.executemany(
                "INSERT OR IGNORE INTO system_prompts (profile_id, step_name, prompt_template) VALUES (?, ?, ?)",
                [(profile_id, step, _SEED_TEMPLATES[step]) for step in _SEED_STEPS],
            )

    logger.info("Default profiles (A/B/C) seeded.")
# Bootstrap both SQLite databases when the application module is imported:
# create the tables if they are missing and seed the default prompt profiles.
# Each connection is closed in a ``finally`` so a failing init/seed step does
# not leave an open handle behind while the import error propagates.
_prompts_conn = get_prompts_db()
try:
    init_prompts_db(_prompts_conn)
    seed_default_profiles(_prompts_conn)
finally:
    _prompts_conn.close()

_history_conn = get_history_db()
try:
    init_history_db(_history_conn)
finally:
    _history_conn.close()
+""" + +import sqlite3 + +import pytest + + +# ── Helpers ──────────────────────────────────────────────────────────────── + +DEFAULT_PROMPTS_DB = "/tmp/test_prompts.db" +DEFAULT_HISTORY_DB = "/tmp/test_history.db" + + +def _patch_settings(monkeypatch, prompts_path: str, history_path: str): + """Patch Settings to use test-specific DB paths.""" + monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path) + monkeypatch.setenv("HISTORY_DB_PATH", history_path) + from app.core.config import get_settings + get_settings.cache_clear() + + +# ── Config Tests ─────────────────────────────────────────────────────────── + +def test_config_default_db_paths(monkeypatch): + """New config fields should have sensible defaults.""" + monkeypatch.delenv("PROMPTS_DB_PATH", raising=False) + monkeypatch.delenv("HISTORY_DB_PATH", raising=False) + + from app.core.config import Settings + + settings = Settings() + assert settings.prompts_db_path == "./data/prompts.db" + assert settings.history_db_path == "./data/history.db" + + +def test_config_db_paths_from_env(tmp_path, monkeypatch): + """DB paths should be configurable via environment variables.""" + prompts_path = str(tmp_path / "my_prompts.db") + history_path = str(tmp_path / "my_history.db") + monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path) + monkeypatch.setenv("HISTORY_DB_PATH", history_path) + + from app.core.config import Settings + + settings = Settings() + assert settings.prompts_db_path == prompts_path + assert settings.history_db_path == history_path + + +# ── Connection Factory Tests ─────────────────────────────────────────────── + +def test_get_prompts_db_creates_file_and_dir(tmp_path, monkeypatch): + """get_prompts_db() should create the DB file and any missing parent dirs.""" + prompts_path = str(tmp_path / "subdir" / "prompts.db") + history_path = str(tmp_path / "subdir" / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db + + conn = get_prompts_db() + 
assert conn is not None + import os + assert os.path.isfile(prompts_path) + conn.close() + + +def test_get_history_db_creates_file_and_dir(tmp_path, monkeypatch): + """get_history_db() should create the DB file and any missing parent dirs.""" + prompts_path = str(tmp_path / "subdir" / "prompts.db") + history_path = str(tmp_path / "subdir" / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_history_db + + conn = get_history_db() + assert conn is not None + import os + assert os.path.isfile(history_path) + conn.close() + + +def test_wal_mode_enabled(tmp_path, monkeypatch): + """WAL journal mode should be enabled for better concurrency.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db + + conn = get_prompts_db() + result = conn.execute("PRAGMA journal_mode").fetchone() + assert result[0].upper() == "WAL" + conn.close() + + +def test_foreign_keys_enabled(tmp_path, monkeypatch): + """Foreign key enforcement should be enabled.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db + + conn = get_prompts_db() + result = conn.execute("PRAGMA foreign_keys").fetchone() + assert result[0] == 1 + conn.close() + + +def test_row_factory_is_row(tmp_path, monkeypatch): + """Row factory should be sqlite3.Row for dict-like access.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db + + conn = get_prompts_db() + assert conn.row_factory is sqlite3.Row + conn.close() + + +# ── Table Creation Tests ─────────────────────────────────────────────────── + +def 
test_init_prompts_db_creates_tables(tmp_path, monkeypatch): + """init_prompts_db() should create system_prompt_profiles + system_prompts tables.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db, init_prompts_db + + conn = get_prompts_db() + init_prompts_db(conn) + + tables = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" + ).fetchall() + table_names = {t["name"] for t in tables} + assert "system_prompt_profiles" in table_names + assert "system_prompts" in table_names + conn.close() + + +def test_init_prompts_db_idempotent(tmp_path, monkeypatch): + """Calling init_prompts_db() twice should not raise errors.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db, init_prompts_db + + conn = get_prompts_db() + init_prompts_db(conn) + init_prompts_db(conn) # second call — must not raise + conn.close() + + +def test_init_history_db_creates_table_and_index(tmp_path, monkeypatch): + """init_history_db() should create query_history table + created_at index.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_history_db, init_history_db + + conn = get_history_db() + init_history_db(conn) + + tables = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" + ).fetchall() + table_names = {t["name"] for t in tables} + assert "query_history" in table_names + + indexes = conn.execute( + "SELECT name FROM sqlite_master WHERE type='index' ORDER BY name" + ).fetchall() + index_names = {i["name"] for i in indexes} + assert "idx_query_history_created_at" in index_names + conn.close() + + 
def test_seed_default_profiles_creates_three_profiles(tmp_path, monkeypatch):
    """seed_default_profiles() should insert exactly 3 profiles: A, B, C."""
    from contextlib import closing

    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")
    _patch_settings(monkeypatch, prompts_path, history_path)

    from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles

    # closing() releases the connection even when init/seed raises or an
    # assertion fails, so a red test does not leak an open database handle.
    with closing(get_prompts_db()) as conn:
        init_prompts_db(conn)
        seed_default_profiles(conn)

        rows = conn.execute(
            "SELECT name, is_active FROM system_prompt_profiles ORDER BY id"
        ).fetchall()
        assert len(rows) == 3
        # Ordering by id also checks that the profiles were inserted as A, B, C.
        assert [row["name"] for row in rows] == ["A", "B", "C"]
is_active FROM system_prompt_profiles WHERE name='C'" + ).fetchone() + + assert profile_a["is_active"] == 1 + assert profile_b["is_active"] == 0 + assert profile_c["is_active"] == 0 + conn.close() + + +def test_seed_default_profiles_creates_nine_prompts(tmp_path, monkeypatch): + """seed_default_profiles() should insert 9 prompt rows (3 profiles × 3 steps).""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles + + conn = get_prompts_db() + init_prompts_db(conn) + seed_default_profiles(conn) + + rows = conn.execute( + """SELECT sp.step_name, spp.name AS profile_name + FROM system_prompts sp + JOIN system_prompt_profiles spp ON sp.profile_id = spp.id + ORDER BY spp.name, sp.step_name""" + ).fetchall() + assert len(rows) == 9 + + # Each profile should have all 3 steps + for profile in ("A", "B", "C"): + profile_rows = [r for r in rows if r["profile_name"] == profile] + steps = {r["step_name"] for r in profile_rows} + assert steps == {"decompose", "filter", "generate"} + conn.close() + + +def test_seed_default_profiles_contains_expected_templates(tmp_path, monkeypatch): + """Templates should contain expected placeholders for each step.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles + + conn = get_prompts_db() + init_prompts_db(conn) + seed_default_profiles(conn) + + # Decompose must have {question} + decompose_row = conn.execute( + """SELECT sp.prompt_template + FROM system_prompts sp + JOIN system_prompt_profiles spp ON sp.profile_id = spp.id + WHERE spp.name='A' AND sp.step_name='decompose'""" + ).fetchone() + assert decompose_row is not None + assert "{question}" in 
decompose_row["prompt_template"] + + # Filter must have {question} and {chunks} + filter_row = conn.execute( + """SELECT sp.prompt_template + FROM system_prompts sp + JOIN system_prompt_profiles spp ON sp.profile_id = spp.id + WHERE spp.name='A' AND sp.step_name='filter'""" + ).fetchone() + assert filter_row is not None + assert "{question}" in filter_row["prompt_template"] + assert "{chunks}" in filter_row["prompt_template"] + + # Generate must have {question} and {context} + generate_row = conn.execute( + """SELECT sp.prompt_template + FROM system_prompts sp + JOIN system_prompt_profiles spp ON sp.profile_id = spp.id + WHERE spp.name='A' AND sp.step_name='generate'""" + ).fetchone() + assert generate_row is not None + assert "{question}" in generate_row["prompt_template"] + assert "{context}" in generate_row["prompt_template"] + + conn.close() + + +def test_seed_default_profiles_idempotent(tmp_path, monkeypatch): + """Calling seed_default_profiles() twice should not duplicate rows.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles + + conn = get_prompts_db() + init_prompts_db(conn) + seed_default_profiles(conn) + seed_default_profiles(conn) + + profile_count = conn.execute( + "SELECT COUNT(*) FROM system_prompt_profiles" + ).fetchone()[0] + prompt_count = conn.execute( + "SELECT COUNT(*) FROM system_prompts" + ).fetchone()[0] + + assert profile_count == 3 + assert prompt_count == 9 + conn.close() + + +def test_all_three_profiles_have_identical_seed_templates(tmp_path, monkeypatch): + """All 3 profiles should start with identical seed templates.""" + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + _patch_settings(monkeypatch, prompts_path, history_path) + + from app.core.sqlite_db import get_prompts_db, init_prompts_db, 
seed_default_profiles + + conn = get_prompts_db() + init_prompts_db(conn) + seed_default_profiles(conn) + + for step in ("decompose", "filter", "generate"): + rows = conn.execute( + """SELECT spp.name, sp.prompt_template + FROM system_prompts sp + JOIN system_prompt_profiles spp ON sp.profile_id = spp.id + WHERE sp.step_name = ? + ORDER BY spp.name""", + (step,), + ).fetchall() + + templates = [r["prompt_template"] for r in rows] + # All 3 must match + assert templates[0] == templates[1] == templates[2], ( + f"Step '{step}' templates differ across profiles: {templates}" + ) + + conn.close()