feat(db): Phase 3.1 — SQLite infrastructure (prompts.db + history.db)

- Add sqlite_db.py with dual-DB connection factories (WAL mode, foreign keys)
- init_prompts_db() creates system_prompt_profiles + system_prompts tables
- init_history_db() creates query_history table + created_at index
- seed_default_profiles() inserts 3 profiles (A/B/C) x 3 steps each
- All 3 profiles start with identical seed templates; Profile A active
- Add prompts_db_path + history_db_path to config (./data/ default)
- Startup init in main.py creates data/ dir, inits both DBs, seeds profiles
- Add PROMPTS_DB_PATH + HISTORY_DB_PATH to .env.example
- Add data/ to .gitignore
- 17 new tests in test_phase3_sqlite_db.py (all passing)
This commit is contained in:
Woody 2026-04-25 20:29:29 +08:00
parent b710002c6e
commit f4b404f27d
6 changed files with 553 additions and 0 deletions

3
.gitignore vendored
View File

@ -76,6 +76,9 @@ htmlcov/
# Docker
.dockerignore
# Package 3 — SQLite databases
data/
# Misc
*.bak
*.tmp

View File

@ -14,4 +14,8 @@ CHUNK_OVERLAP=200
RETRIEVAL_N_RESULTS=10
RELEVANCE_THRESHOLD=7.0
# SQLite databases (Package 3)
PROMPTS_DB_PATH=./data/prompts.db
HISTORY_DB_PATH=./data/history.db
CORS_ORIGINS=["http://localhost:5173","http://localhost:3000"]

View File

@ -22,6 +22,10 @@ class Settings(BaseSettings):
# Chunk PDF storage (extracted PDF pages)
document_chunk_path: str = "./document_chunk"
# SQLite databases (Package 3). Each path is opened by the matching
# get_prompts_db() / get_history_db() factory in app.core.sqlite_db and can be
# overridden with the PROMPTS_DB_PATH / HISTORY_DB_PATH environment variables.
prompts_db_path: str = "./data/prompts.db"
history_db_path: str = "./data/history.db"
# App configuration moved to settings for easier testing/configuration
# Cross-origin settings and chunking parameters (Phase 1 plan)
cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"]

View File

@ -0,0 +1,145 @@
import logging
import os
import sqlite3
from pathlib import Path
from app.core.config import get_settings
logger = logging.getLogger(__name__)
# ── Seed prompt templates (from current hardcoded service prompts) ──────────
# Each template is a str.format-style string; the placeholders in braces are
# left intact here and stored verbatim in the database.

# Placeholder: {question}
_SEED_DECOMPOSE = (
    "Given this question: '{question}'\n"
    "\n"
    "Break it down into 2-5 simplified sub-questions that would help search "
    "for relevant information. "
    "Each sub-question should be short and focused on one aspect. "
    "Return as a JSON array of strings."
)

# Placeholders: {question}, {chunks}
_SEED_FILTER = (
    "Given question '{question}' and these document chunks, "
    "rate each 0-10 for relevance. "
    "Return JSON array of scores.\n"
    "{chunks}\n"
)

# Placeholders: {question}, {context}
_SEED_GENERATE = (
    "Question: {question}\n"
    "\n"
    "Answer the question using ONLY these document chunks. "
    "Do not use any external knowledge. "
    "Format your answer as bullet points. "
    "Cite your sources inline using the exact bracket labels provided, e.g. [filename, page N]. "
    "Place the citation at the end of each relevant point.\n"
    "\n"
    "Document chunks:\n"
    "{context}\n"
    "\n"
    "Answer:"
)

# (profile name, is_active flag) — only profile "A" starts out active.
_SEED_PROFILES = [("A", 1), ("B", 0), ("C", 0)]

# Pipeline steps, in the order their prompts are seeded.
_SEED_STEPS = ["decompose", "filter", "generate"]

# Step name -> seed template, paired positionally with _SEED_STEPS.
_SEED_TEMPLATES = dict(
    zip(_SEED_STEPS, (_SEED_DECOMPOSE, _SEED_FILTER, _SEED_GENERATE))
)
# ── Connection factories ────────────────────────────────────────────────────
def _get_db(db_path: str) -> sqlite3.Connection:
    """Open a SQLite connection to *db_path* with the module's standard setup.

    The parent directory of *db_path* is created when it is missing. The
    returned connection uses ``sqlite3.Row`` as its row factory and has the
    ``journal_mode=WAL`` and ``foreign_keys=ON`` pragmas applied.

    Args:
        db_path: Location of the database file. A path with no directory
            component (a bare filename, or ``":memory:"``) is accepted.

    Returns:
        An open connection; the caller is responsible for closing it.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the database cannot be opened or configured.
    """
    parent_dir = os.path.dirname(db_path)
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually names one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA foreign_keys=ON")
    except Exception:
        # Do not leak the handle when configuration fails part-way through.
        conn.close()
        raise
    return conn
def get_prompts_db() -> sqlite3.Connection:
    """Return a new connection to the prompts database named in settings."""
    settings = get_settings()
    return _get_db(settings.prompts_db_path)
def get_history_db() -> sqlite3.Connection:
    """Return a new connection to the history database named in settings."""
    settings = get_settings()
    return _get_db(settings.history_db_path)
# ── Table initialization ────────────────────────────────────────────────────
def init_prompts_db(conn: sqlite3.Connection) -> None:
    """Create the prompt-profile tables on *conn* if they are missing.

    Two tables are created: ``system_prompt_profiles`` (one row per uniquely
    named profile) and ``system_prompts`` (one row per profile/step pair,
    declared with ``ON DELETE CASCADE`` on its profile reference). Both
    statements use ``IF NOT EXISTS``, so repeated calls do not fail.

    Args:
        conn: Open connection to the prompts database.
    """
    profiles_ddl = """
        CREATE TABLE IF NOT EXISTS system_prompt_profiles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL UNIQUE,
            is_active INTEGER DEFAULT 0,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
    """
    prompts_ddl = """
        CREATE TABLE IF NOT EXISTS system_prompts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            profile_id INTEGER NOT NULL,
            step_name TEXT NOT NULL,
            prompt_template TEXT NOT NULL,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now')),
            FOREIGN KEY (profile_id) REFERENCES system_prompt_profiles(id) ON DELETE CASCADE,
            UNIQUE(profile_id, step_name)
        )
    """
    # Parent table first, then the child table that references it.
    for ddl in (profiles_ddl, prompts_ddl):
        conn.execute(ddl)
    conn.commit()
    logger.info("Prompts DB tables initialized.")
def init_history_db(conn: sqlite3.Connection) -> None:
    """Create the ``query_history`` table and its ``created_at`` index.

    Both statements use ``IF NOT EXISTS``, so repeated calls do not fail.

    Args:
        conn: Open connection to the history database.
    """
    table_ddl = """
        CREATE TABLE IF NOT EXISTS query_history (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            input_text TEXT NOT NULL,
            extracted_questions TEXT DEFAULT NULL,
            decomposer_time_ms INTEGER DEFAULT 0,
            retriever_time_ms INTEGER DEFAULT 0,
            chunks_retrieved INTEGER DEFAULT 0,
            filter_time_ms INTEGER DEFAULT 0,
            chunks_filtered INTEGER DEFAULT 0,
            generator_time_ms INTEGER DEFAULT 0,
            total_time_ms INTEGER DEFAULT 0,
            final_answer TEXT DEFAULT NULL,
            sources TEXT DEFAULT NULL,
            profile_used TEXT DEFAULT NULL,
            created_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
    """
    index_ddl = """
        CREATE INDEX IF NOT EXISTS idx_query_history_created_at
        ON query_history(created_at DESC)
    """
    # The index refers to the table, so the table must be created first.
    for ddl in (table_ddl, index_ddl):
        conn.execute(ddl)
    conn.commit()
    logger.info("History DB tables initialized.")
# ── Seed data ───────────────────────────────────────────────────────────────
def seed_default_profiles(conn: sqlite3.Connection) -> None:
    """Insert the default profiles and their per-step prompt templates.

    Every entry of ``_SEED_PROFILES`` is inserted with ``INSERT OR IGNORE``.
    When a profile row already exists nothing more is done for it, so its
    existing prompts are left as they are. A newly inserted profile receives
    one prompt row per entry in ``_SEED_STEPS``.

    Args:
        conn: Open connection to the prompts database. Its rows must support
            access by column name (``row["id"]``).
    """
    insert_profile_sql = (
        "INSERT OR IGNORE INTO system_prompt_profiles (name, is_active) VALUES (?, ?)"
    )
    select_id_sql = "SELECT id FROM system_prompt_profiles WHERE name=?"
    insert_prompt_sql = (
        "INSERT OR IGNORE INTO system_prompts (profile_id, step_name, prompt_template) VALUES (?, ?, ?)"
    )

    for name, active_flag in _SEED_PROFILES:
        cursor = conn.execute(insert_profile_sql, (name, active_flag))
        # rowcount is 0 when the profile was already there and the insert
        # was ignored; only freshly created profiles get seed prompts.
        if cursor.rowcount != 0:
            new_profile = conn.execute(select_id_sql, (name,)).fetchone()
            new_id = new_profile["id"]
            conn.executemany(
                insert_prompt_sql,
                [(new_id, step, _SEED_TEMPLATES[step]) for step in _SEED_STEPS],
            )

    conn.commit()
    logger.info("Default profiles (A/B/C) seeded.")

View File

@ -8,6 +8,13 @@ from fastapi.middleware.cors import CORSMiddleware
from app.routers import ingest, query, documents
from app.core.config import get_settings
from app.core.sqlite_db import (
get_prompts_db,
get_history_db,
init_prompts_db,
init_history_db,
seed_default_profiles,
)
# Configure logging before app initialization
# LOG_DIR is the "log" directory located next to this module's file.
LOG_DIR = Path(__file__).parent / "log"
@ -46,6 +53,15 @@ app.include_router(ingest.router, prefix="/api/v1")
app.include_router(query.router, prefix="/api/v1")
app.include_router(documents.router, prefix="/api/v1")

# Prepare both SQLite databases when this module is imported: create the
# tables if needed and seed the default prompt profiles. Each connection is
# closed in a ``finally`` so a failing init/seed step does not leak the handle.
_prompts_conn = get_prompts_db()
try:
    init_prompts_db(_prompts_conn)
    seed_default_profiles(_prompts_conn)
finally:
    _prompts_conn.close()

_history_conn = get_history_db()
try:
    init_history_db(_history_conn)
finally:
    _history_conn.close()
@app.get("/health")
def health_check():

View File

@ -0,0 +1,381 @@
"""Tests for Package 3.1 SQLite infrastructure — prompts.db + history.db.
Covers: connection factories, WAL mode, foreign keys, table creation, seed data, idempotency.
Uses tmp_path for isolated test databases, so there is no real filesystem pollution.
"""
import sqlite3
import pytest
# ── Helpers ────────────────────────────────────────────────────────────────
# Fixed fallback locations for test databases.
# NOTE(review): no test visible in this file references these constants —
# every test builds its paths from tmp_path. Confirm whether they are still needed.
DEFAULT_PROMPTS_DB = "/tmp/test_prompts.db"
DEFAULT_HISTORY_DB = "/tmp/test_history.db"
def _patch_settings(monkeypatch, prompts_path: str, history_path: str):
    """Point the app settings at test-specific database files.

    Sets PROMPTS_DB_PATH and HISTORY_DB_PATH through *monkeypatch*, then
    clears the cache on ``get_settings`` so that the next call is not served
    from a previously cached value.
    """
    env_overrides = {
        "PROMPTS_DB_PATH": prompts_path,
        "HISTORY_DB_PATH": history_path,
    }
    for env_var, value in env_overrides.items():
        monkeypatch.setenv(env_var, value)

    from app.core.config import get_settings

    get_settings.cache_clear()
# ── Config Tests ───────────────────────────────────────────────────────────
def test_config_default_db_paths(monkeypatch):
    """Without env overrides, Settings falls back to the ./data/ defaults."""
    for env_var in ("PROMPTS_DB_PATH", "HISTORY_DB_PATH"):
        monkeypatch.delenv(env_var, raising=False)

    from app.core.config import Settings

    cfg = Settings()
    assert (cfg.prompts_db_path, cfg.history_db_path) == (
        "./data/prompts.db",
        "./data/history.db",
    )
def test_config_db_paths_from_env(tmp_path, monkeypatch):
    """Environment variables override the default database locations."""
    overrides = {
        "PROMPTS_DB_PATH": str(tmp_path / "my_prompts.db"),
        "HISTORY_DB_PATH": str(tmp_path / "my_history.db"),
    }
    for env_var, value in overrides.items():
        monkeypatch.setenv(env_var, value)

    from app.core.config import Settings

    cfg = Settings()
    assert cfg.prompts_db_path == overrides["PROMPTS_DB_PATH"]
    assert cfg.history_db_path == overrides["HISTORY_DB_PATH"]
# ── Connection Factory Tests ───────────────────────────────────────────────
def test_get_prompts_db_creates_file_and_dir(tmp_path, monkeypatch):
    """get_prompts_db() creates the DB file along with a missing parent dir."""
    db_dir = tmp_path / "subdir"  # intentionally not created beforehand
    prompts_file = db_dir / "prompts.db"
    _patch_settings(monkeypatch, str(prompts_file), str(db_dir / "history.db"))

    from app.core.sqlite_db import get_prompts_db

    db = get_prompts_db()
    assert db is not None
    assert prompts_file.is_file()
    db.close()
def test_get_history_db_creates_file_and_dir(tmp_path, monkeypatch):
    """get_history_db() creates the DB file along with a missing parent dir."""
    db_dir = tmp_path / "subdir"  # intentionally not created beforehand
    history_file = db_dir / "history.db"
    _patch_settings(monkeypatch, str(db_dir / "prompts.db"), str(history_file))

    from app.core.sqlite_db import get_history_db

    db = get_history_db()
    assert db is not None
    assert history_file.is_file()
    db.close()
def test_wal_mode_enabled(tmp_path, monkeypatch):
    """Connections from the factory report WAL as their journal mode."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db

    db = get_prompts_db()
    journal_mode = db.execute("PRAGMA journal_mode").fetchone()[0]
    assert journal_mode.upper() == "WAL"
    db.close()
def test_foreign_keys_enabled(tmp_path, monkeypatch):
    """Connections from the factory have foreign key enforcement switched on."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db

    db = get_prompts_db()
    fk_flag = db.execute("PRAGMA foreign_keys").fetchone()[0]
    assert fk_flag == 1
    db.close()
def test_row_factory_is_row(tmp_path, monkeypatch):
    """The factory installs sqlite3.Row so columns can be read by name."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db

    db = get_prompts_db()
    assert db.row_factory is sqlite3.Row
    db.close()
# ── Table Creation Tests ───────────────────────────────────────────────────
def test_init_prompts_db_creates_tables(tmp_path, monkeypatch):
    """init_prompts_db() creates both prompt tables."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db, init_prompts_db

    db = get_prompts_db()
    init_prompts_db(db)

    existing_tables = {
        row["name"]
        for row in db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
        )
    }
    assert "system_prompt_profiles" in existing_tables
    assert "system_prompts" in existing_tables
    db.close()
def test_init_prompts_db_idempotent(tmp_path, monkeypatch):
    """Running init_prompts_db() a second time must not raise."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db, init_prompts_db

    db = get_prompts_db()
    # The second pass is the one under test.
    for _ in range(2):
        init_prompts_db(db)
    db.close()
def test_init_history_db_creates_table_and_index(tmp_path, monkeypatch):
    """init_history_db() creates query_history and its created_at index."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_history_db, init_history_db

    db = get_history_db()
    init_history_db(db)

    existing_tables = {
        row["name"]
        for row in db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
        )
    }
    assert "query_history" in existing_tables

    existing_indexes = {
        row["name"]
        for row in db.execute(
            "SELECT name FROM sqlite_master WHERE type='index' ORDER BY name"
        )
    }
    assert "idx_query_history_created_at" in existing_indexes
    db.close()
def test_init_history_db_idempotent(tmp_path, monkeypatch):
    """Running init_history_db() a second time must not raise."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_history_db, init_history_db

    db = get_history_db()
    # The second pass is the one under test.
    for _ in range(2):
        init_history_db(db)
    db.close()
# ── Seed Data Tests ────────────────────────────────────────────────────────
def test_seed_default_profiles_creates_three_profiles(tmp_path, monkeypatch):
    """Seeding yields exactly the profiles A, B and C, in insertion order."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles

    db = get_prompts_db()
    init_prompts_db(db)
    seed_default_profiles(db)

    seeded_names = [
        row["name"]
        for row in db.execute(
            "SELECT name, is_active FROM system_prompt_profiles ORDER BY id"
        )
    ]
    assert len(seeded_names) == 3
    assert seeded_names == ["A", "B", "C"]
    db.close()
def test_seed_default_profiles_A_is_active(tmp_path, monkeypatch):
    """After seeding, only profile A is active; B and C are inactive."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles

    db = get_prompts_db()
    init_prompts_db(db)
    seed_default_profiles(db)

    # (lookup query, expected is_active value)
    expectations = (
        ("SELECT name, is_active FROM system_prompt_profiles WHERE name='A'", 1),
        ("SELECT name, is_active FROM system_prompt_profiles WHERE name='B'", 0),
        ("SELECT name, is_active FROM system_prompt_profiles WHERE name='C'", 0),
    )
    fetched = [(db.execute(query).fetchone(), flag) for query, flag in expectations]
    for profile_row, expected_flag in fetched:
        assert profile_row["is_active"] == expected_flag
    db.close()
def test_seed_default_profiles_creates_nine_prompts(tmp_path, monkeypatch):
    """Seeding yields 9 prompt rows: every profile has each of the 3 steps."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles

    db = get_prompts_db()
    init_prompts_db(db)
    seed_default_profiles(db)

    prompt_rows = db.execute(
        """SELECT sp.step_name, spp.name AS profile_name
        FROM system_prompts sp
        JOIN system_prompt_profiles spp ON sp.profile_id = spp.id
        ORDER BY spp.name, sp.step_name"""
    ).fetchall()
    assert len(prompt_rows) == 9

    # Group the step names by the profile they belong to.
    steps_by_profile = {}
    for row in prompt_rows:
        steps_by_profile.setdefault(row["profile_name"], set()).add(row["step_name"])

    for profile in ("A", "B", "C"):
        assert steps_by_profile.get(profile, set()) == {"decompose", "filter", "generate"}
    db.close()
def test_seed_default_profiles_contains_expected_templates(tmp_path, monkeypatch):
    """Profile A's seeded templates carry the placeholders each step needs."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles

    db = get_prompts_db()
    init_prompts_db(db)
    seed_default_profiles(db)

    # Step name -> placeholders its template must contain.
    required_placeholders = {
        "decompose": ("{question}",),
        "filter": ("{question}", "{chunks}"),
        "generate": ("{question}", "{context}"),
    }
    for step, placeholders in required_placeholders.items():
        # `step` only ever takes the fixed literal keys above.
        template_row = db.execute(
            f"""SELECT sp.prompt_template
            FROM system_prompts sp
            JOIN system_prompt_profiles spp ON sp.profile_id = spp.id
            WHERE spp.name='A' AND sp.step_name='{step}'"""
        ).fetchone()
        assert template_row is not None
        for placeholder in placeholders:
            assert placeholder in template_row["prompt_template"]
    db.close()
def test_seed_default_profiles_idempotent(tmp_path, monkeypatch):
    """Seeding twice leaves the row counts at 3 profiles and 9 prompts."""
    _patch_settings(
        monkeypatch,
        str(tmp_path / "prompts.db"),
        str(tmp_path / "history.db"),
    )

    from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles

    db = get_prompts_db()
    init_prompts_db(db)
    # The second pass is the one under test.
    for _ in range(2):
        seed_default_profiles(db)

    profile_total = db.execute(
        "SELECT COUNT(*) FROM system_prompt_profiles"
    ).fetchone()[0]
    prompt_total = db.execute(
        "SELECT COUNT(*) FROM system_prompts"
    ).fetchone()[0]
    assert profile_total == 3
    assert prompt_total == 9
    db.close()
def test_all_three_profiles_have_identical_seed_templates(tmp_path, monkeypatch):
    """All 3 profiles should start with identical seed templates.

    For every step, exactly three templates (one per profile) must exist and
    all of them must be equal.
    """
    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")
    _patch_settings(monkeypatch, prompts_path, history_path)

    from app.core.sqlite_db import get_prompts_db, init_prompts_db, seed_default_profiles

    conn = get_prompts_db()
    init_prompts_db(conn)
    seed_default_profiles(conn)

    for step in ("decompose", "filter", "generate"):
        rows = conn.execute(
            """SELECT spp.name, sp.prompt_template
            FROM system_prompts sp
            JOIN system_prompt_profiles spp ON sp.profile_id = spp.id
            WHERE sp.step_name = ?
            ORDER BY spp.name""",
            (step,),
        ).fetchall()
        templates = [r["prompt_template"] for r in rows]
        # Check the count first: indexing a short list would surface as an
        # IndexError instead of a readable assertion failure.
        assert len(templates) == 3, (
            f"Step '{step}' expected 3 templates, found {len(templates)}"
        )
        # A set of size 1 means every template matches, however many there are.
        assert len(set(templates)) == 1, (
            f"Step '{step}' templates differ across profiles: {templates}"
        )
    conn.close()