From 3c2d6479437ad46709914f731fe04390021da3b2 Mon Sep 17 00:00:00 2001 From: Woody Date: Wed, 22 Apr 2026 15:22:29 +0800 Subject: [PATCH] init: project setup with AGENTS.md, test structure, and plan directory --- .env.txt | 1 + AGENTS.md | 148 +++++++++++++++++++ backend/app/test/conftest.py | 23 +++ backend/app/test/test_phase1_chunking.py | 24 +++ backend/app/test/test_phase1_ingest.py | 29 ++++ backend/app/test/test_phase1_llm_client.py | 25 ++++ backend/app/test/test_phase1_metadata.py | 25 ++++ backend/app/test/test_phase1_query.py | 25 ++++ backend/app/test/test_phase1_rag_service.py | 25 ++++ backend/app/test/test_phase2_asr_client.py | 25 ++++ backend/app/test/test_phase2_video_upload.py | 29 ++++ backend/app/test/test_phase2_ws_asr.py | 28 ++++ development_plan.md | 140 ++++++++++++++++++ 13 files changed, 547 insertions(+) create mode 100644 .env.txt create mode 100644 AGENTS.md create mode 100644 backend/app/test/conftest.py create mode 100644 backend/app/test/test_phase1_chunking.py create mode 100644 backend/app/test/test_phase1_ingest.py create mode 100644 backend/app/test/test_phase1_llm_client.py create mode 100644 backend/app/test/test_phase1_metadata.py create mode 100644 backend/app/test/test_phase1_query.py create mode 100644 backend/app/test/test_phase1_rag_service.py create mode 100644 backend/app/test/test_phase2_asr_client.py create mode 100644 backend/app/test/test_phase2_video_upload.py create mode 100644 backend/app/test/test_phase2_ws_asr.py create mode 100644 development_plan.md diff --git a/.env.txt b/.env.txt new file mode 100644 index 0000000..d2358a2 --- /dev/null +++ b/.env.txt @@ -0,0 +1 @@ +ALIBABA=sk-e84c76a30243448dadfd6eab6d90c3f2 \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..d011d76 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,148 @@ +# RAG Video Q&A — Project Knowledge Base + +**Generated:** 2026-04-22 +**Source:** development_plan.md +**Status:** Greenfield (no code yet) + 
+--- + +## OVERVIEW +RAG-powered Video Q&A web app. Phase 1: text → ChromaDB retrieval → bullet-point answer. Phase 2: video upload → real-time ASR → auto/manual RAG query. FastAPI backend + React 18 (Vite) frontend. + +## STRUCTURE +``` +app/ +├── backend/ # FastAPI (Python) +│ ├── app/ +│ │ ├── main.py +│ │ ├── routers/ # query.py, ingest.py, video.py, ws_asr.py +│ │ ├── services/ # rag.py, llm_client.py, asr_client.py, video_service.py +│ │ ├── models/ # Pydantic schemas +│ │ ├── core/ # config.py, database.py +│ │ └── utils/ # chunking.py, metadata_extraction.py +│ ├── uploads/ # video storage (max 300MB) +│ ├── requirements.txt +│ └── .env.example +├── frontend/ # React 18 + TS + Vite +│ ├── src/ +│ │ ├── components/ # shadcn/ui + custom +│ │ ├── pages/ +│ │ ├── lib/ +│ │ │ └── api.ts # API client (TanStack Query) +│ │ └── App.tsx +│ ├── package.json +│ └── vite.config.ts +├── chroma_db/ # Persistent vector store +├── Dockerfile +├── docker-compose.yml +├── nginx.conf +└── deploy.sh +``` + +## WHERE TO LOOK +| Task | Location | Notes | +|------|----------|-------| +| API routes | `backend/app/routers/` | Versioned `/api/v1/...` | +| Business logic | `backend/app/services/` | RAG, LLM, ASR, video | +| Schemas | `backend/app/models/` | Pydantic request/response | +| Config | `backend/app/core/config.py` | `.env` driven | +| DB init | `backend/app/core/database.py` | ChromaDB persistent | +| Frontend API | `frontend/src/lib/api.ts` | TanStack Query | +| UI components | `frontend/src/components/` | shadcn/ui + Tailwind | + +## CODE MAP +*Greenfield — no code yet. 
See development_plan.md for full specification.* + +## CONVENTIONS +- **Backend**: `snake_case` files; routers thin, services thick; `.env` for all LLM/ASR config +- **Frontend**: PascalCase components; `lib/api.ts` single API client; TanStack Query for server state +- **API**: Path versioning `/api/v1/`; WebSocket at `/ws/asr/{video_id}` +- **RAG**: Strict prompt — answer ONLY from retrieved context; bullet-point format +- **Metadata**: Every doc chunk must have `filename`, `upload_date`, `content_summary` + +## ANTI-PATTERNS (THIS PROJECT) +- Hardcode LLM URLs/keys — always `.env` +- Business logic in routers — belongs in `services/` +- Non-persistent ChromaDB — must use `chroma_db/` directory +- LLM hallucination outside retrieved context — strict RAG prompt enforced +- Plain text responses — always bullet points with source metadata +- Missing document metadata — breaks source attribution +- Add authentication — public demo only +- Mobile-first design — desktop only at this stage + +## UNIQUE STYLES +- **Dual ASR trigger**: automatic (on transcript update) + manual "Ask from Video" button +- **Layout**: Top-Left video player | Top-Right transcript + input | Bottom RAG response +- **Provider switching**: same codebase runs dev (OpenRouter/Alibaba Cloud) and prod (local vLLM) +- **Video limit**: 300MB max, MP4 + common formats + +## TESTING + +**Backend test directory**: `backend/app/test/` + +**Naming convention** (pytest, flat structure, phase-prefixed): +``` +test_phase<N>_<feature>.py +``` + +**Examples**: +- `test_phase1_ingest.py` — Document upload & ChromaDB ingestion +- `test_phase1_query.py` — RAG query endpoint +- `test_phase1_rag_service.py` — RAG retrieval + strict prompt logic +- `test_phase1_llm_client.py` — LLM client (mocked provider) +- `test_phase1_chunking.py` — Document chunking utils +- `test_phase1_metadata.py` — Metadata extraction +- `test_phase2_video_upload.py` — Video upload (<300MB, format validation) +- `test_phase2_asr_client.py` — ASR 
transcription client +- `test_phase2_ws_asr.py` — WebSocket audio streaming +- `test_phase2_query_from_video.py` — Auto/manual trigger from transcript +- `test_integration_phase1.py` — End-to-end text → RAG → answer +- `test_integration_phase2.py` — End-to-end video → ASR → RAG → answer + +**Rules**: +- Use `pytest` + `pytest-asyncio` for async tests +- Mock all external LLM/ASR calls (do not hit live APIs in tests) +- Use `tmp_path` fixture for ChromaDB test instances +- Each test file must have a module-level docstring describing coverage + +## COMMANDS +```bash +# Dev +backend: uvicorn app.main:app --reload --port 8000 +frontend: npm run dev + +# Test +backend: cd backend && pytest app/test/ -v + +# Prod +docker-compose up -d +./deploy.sh +``` + +## PLAN STORAGE + +**All development plans** (including sub-plans, debug plans, and task breakdowns) **must be stored in `.plans/`**. + +``` +.plans/ +├── development_plan.md # Main development plan (root-level) +├── phase1_backend_plan.md # Phase 1 backend tasks +├── phase1_frontend_plan.md # Phase 1 frontend tasks +├── phase2_backend_plan.md # Phase 2 backend tasks +├── phase2_frontend_plan.md # Phase 2 frontend tasks +├── debug_<topic>_<date>.md # Debug/diagnosis logs +└── _template.md # Plan template (optional) +``` + +**Rules**: +- Name format: `<scope>_<topic>.md` (snake_case) +- Use `debug_` prefix for troubleshooting logs +- Root `development_plan.md` stays at root as canonical source +- Sub-plans reference root plan, never duplicate it + +## NOTES +- No routing library specified — single-page app likely sufficient +- No client state library specified — `useState`/`useReducer` + TanStack Query +- WebSocket client not specified — may need to expand `lib/api.ts` +- shadcn/ui components are copied, not imported as npm package +- Alibaba Cloud reference: https://modelstudio.console.alibabacloud.com/ap-southeast-1?switchAgent=101503&tab=doc&productCode=p_efm&switchUserType=3#/doc/?type=model&url=2989727 diff --git 
a/backend/app/test/conftest.py b/backend/app/test/conftest.py new file mode 100644 index 0000000..cedf128 --- /dev/null +++ b/backend/app/test/conftest.py @@ -0,0 +1,23 @@ +"""Shared pytest fixtures for backend tests. + +All external LLM/ASR calls must be mocked. Use tmp_path for ChromaDB instances. +""" +import pytest + + +@pytest.fixture +def mock_llm_client(monkeypatch): + """Mock LLM client to avoid hitting live APIs.""" + pass # TODO: implement mock + + +@pytest.fixture +def mock_asr_client(monkeypatch): + """Mock ASR client to avoid hitting live APIs.""" + pass # TODO: implement mock + + +@pytest.fixture +def chroma_test_dir(tmp_path): + """Provide a temporary directory for isolated ChromaDB instances.""" + return tmp_path / "chroma_test" diff --git a/backend/app/test/test_phase1_chunking.py b/backend/app/test/test_phase1_chunking.py new file mode 100644 index 0000000..2665b6c --- /dev/null +++ b/backend/app/test/test_phase1_chunking.py @@ -0,0 +1,24 @@ +"""Phase 1 tests: Document chunking utilities. + +Covers: +- Text splitting strategies +- Chunk size and overlap parameters +- Handling of different document formats +""" +import pytest + + +class TestChunking: + """Document chunking utility tests.""" + + def test_chunk_size_limit(self): + """Should respect maximum chunk size.""" + pass # TODO: implement + + def test_chunk_overlap(self): + """Should include overlap between adjacent chunks.""" + pass # TODO: implement + + def test_empty_document(self): + """Should handle empty or whitespace-only documents.""" + pass # TODO: implement diff --git a/backend/app/test/test_phase1_ingest.py b/backend/app/test/test_phase1_ingest.py new file mode 100644 index 0000000..324da44 --- /dev/null +++ b/backend/app/test/test_phase1_ingest.py @@ -0,0 +1,29 @@ +"""Phase 1 tests: Document ingestion endpoint. 
+ +Covers: +- POST /api/v1/ingest with valid documents +- Metadata extraction (filename, upload_date, content_summary) +- ChromaDB persistence with embeddings +- Error handling for unsupported file types +""" +import pytest + + +class TestIngest: + """Document upload and ChromaDB ingestion tests.""" + + def test_ingest_pdf_success(self): + """Should ingest PDF and return document ID with metadata.""" + pass # TODO: implement + + def test_ingest_txt_success(self): + """Should ingest plain text and chunk correctly.""" + pass # TODO: implement + + def test_ingest_metadata_extraction(self): + """Should extract filename, upload_date, content_summary.""" + pass # TODO: implement + + def test_ingest_unsupported_format(self): + """Should reject unsupported file formats.""" + pass # TODO: implement diff --git a/backend/app/test/test_phase1_llm_client.py b/backend/app/test/test_phase1_llm_client.py new file mode 100644 index 0000000..2ebff2a --- /dev/null +++ b/backend/app/test/test_phase1_llm_client.py @@ -0,0 +1,25 @@ +"""Phase 1 tests: LLM client. + +Covers: +- OpenAI-compatible API client for Qwen LLM +- Provider switching via .env (OpenRouter, Alibaba Cloud, vLLM) +- Error handling for API failures +- Mocked responses in test mode +""" +import pytest + + +class TestLLMClient: + """LLM client tests (all external calls mocked).""" + + def test_llm_call_success(self, mock_llm_client): + """Should return structured response from mocked LLM.""" + pass # TODO: implement + + def test_llm_provider_switching(self): + """Should switch base URL based on .env config.""" + pass # TODO: implement + + def test_llm_api_error_handling(self): + """Should handle HTTP errors from LLM provider.""" + pass # TODO: implement diff --git a/backend/app/test/test_phase1_metadata.py b/backend/app/test/test_phase1_metadata.py new file mode 100644 index 0000000..8dc7c38 --- /dev/null +++ b/backend/app/test/test_phase1_metadata.py @@ -0,0 +1,25 @@ +"""Phase 1 tests: Metadata extraction utilities. 
+ +Covers: +- Filename extraction +- Upload date generation +- Content summary generation +- Metadata schema validation +""" +import pytest + + +class TestMetadata: + """Metadata extraction utility tests.""" + + def test_extract_filename(self): + """Should extract clean filename from path.""" + pass # TODO: implement + + def test_generate_upload_date(self): + """Should generate ISO format upload date.""" + pass # TODO: implement + + def test_content_summary(self): + """Should generate concise content summary.""" + pass # TODO: implement diff --git a/backend/app/test/test_phase1_query.py b/backend/app/test/test_phase1_query.py new file mode 100644 index 0000000..872d5c0 --- /dev/null +++ b/backend/app/test/test_phase1_query.py @@ -0,0 +1,25 @@ +"""Phase 1 tests: RAG query endpoint. + +Covers: +- POST /api/v1/query question → retrieve → LLM → bullet-point response +- Strict RAG prompt enforcement (only use retrieved context) +- Bullet-point response format +- Source metadata inclusion +""" +import pytest + + +class TestQuery: + """RAG query endpoint tests.""" + + def test_query_returns_bullets(self): + """Should return bullet-point answer with source metadata.""" + pass # TODO: implement + + def test_query_strict_rag_no_hallucination(self): + """Should refuse to answer when no relevant context retrieved.""" + pass # TODO: implement + + def test_query_includes_source_metadata(self): + """Should include filename, upload_date in response.""" + pass # TODO: implement diff --git a/backend/app/test/test_phase1_rag_service.py b/backend/app/test/test_phase1_rag_service.py new file mode 100644 index 0000000..cd31250 --- /dev/null +++ b/backend/app/test/test_phase1_rag_service.py @@ -0,0 +1,25 @@ +"""Phase 1 tests: RAG service logic. 
+ +Covers: +- ChromaDB retrieval with Qwen embeddings +- Context assembly for LLM prompt +- Strict prompt construction (answer ONLY from retrieved context) +- Metadata handling per chunk +""" +import pytest + + +class TestRAGService: + """RAG retrieval and prompt logic tests.""" + + def test_retrieve_relevant_chunks(self): + """Should retrieve semantically relevant chunks from ChromaDB.""" + pass # TODO: implement + + def test_strict_prompt_format(self): + """Should construct prompt forbidding external knowledge.""" + pass # TODO: implement + + def test_chunk_metadata_preserved(self): + """Should preserve filename, upload_date, content_summary per chunk.""" + pass # TODO: implement diff --git a/backend/app/test/test_phase2_asr_client.py b/backend/app/test/test_phase2_asr_client.py new file mode 100644 index 0000000..7e5545c --- /dev/null +++ b/backend/app/test/test_phase2_asr_client.py @@ -0,0 +1,25 @@ +"""Phase 2 tests: ASR transcription client. + +Covers: +- Integration with Qwen/Qwen3-ASR-1.7B +- File upload vs audio content input +- Error handling for transcription failures +- Mocked responses in test mode +""" +import pytest + + +class TestASRClient: + """ASR client tests (all external calls mocked).""" + + def test_asr_transcribe_audio(self, mock_asr_client): + """Should return transcript from mocked ASR.""" + pass # TODO: implement + + def test_asr_file_upload_mode(self): + """Should support file path input.""" + pass # TODO: implement + + def test_asr_audio_content_mode(self): + """Should support raw audio bytes input.""" + pass # TODO: implement diff --git a/backend/app/test/test_phase2_video_upload.py b/backend/app/test/test_phase2_video_upload.py new file mode 100644 index 0000000..c1c4a26 --- /dev/null +++ b/backend/app/test/test_phase2_video_upload.py @@ -0,0 +1,29 @@ +"""Phase 2 tests: Video upload endpoint. 
+ +Covers: +- POST /api/v1/upload-video with size validation (<300MB) +- Format validation (MP4 and common formats) +- Static file serving +- Error handling for oversized/invalid files +""" +import pytest + + +class TestVideoUpload: + """Video upload endpoint tests.""" + + def test_upload_mp4_success(self): + """Should accept valid MP4 under 300MB.""" + pass # TODO: implement + + def test_upload_size_limit(self): + """Should reject files over 300MB.""" + pass # TODO: implement + + def test_upload_invalid_format(self): + """Should reject non-video formats.""" + pass # TODO: implement + + def test_static_file_serving(self): + """Should serve uploaded video via static URL.""" + pass # TODO: implement diff --git a/backend/app/test/test_phase2_ws_asr.py b/backend/app/test/test_phase2_ws_asr.py new file mode 100644 index 0000000..713a226 --- /dev/null +++ b/backend/app/test/test_phase2_ws_asr.py @@ -0,0 +1,28 @@ +"""Phase 2 tests: WebSocket ASR streaming. + +Covers: +- /ws/asr/{video_id} connection lifecycle +- Real-time audio chunk streaming +- Transcript accumulation +- Connection cleanup on disconnect +""" +import pytest + + +class TestWebSocketASR: + """WebSocket ASR streaming tests.""" + + @pytest.mark.asyncio + async def test_ws_connection_established(self): + """Should accept WebSocket connection with valid video_id.""" + pass # TODO: implement + + @pytest.mark.asyncio + async def test_ws_audio_chunk_streaming(self): + """Should process audio chunks and return transcripts.""" + pass # TODO: implement + + @pytest.mark.asyncio + async def test_ws_disconnect_cleanup(self): + """Should clean up resources on client disconnect.""" + pass # TODO: implement diff --git a/development_plan.md b/development_plan.md new file mode 100644 index 0000000..985da00 --- /dev/null +++ b/development_plan.md @@ -0,0 +1,140 @@ +# RAG Video Q&A Web Application - Development Plan + +**Project Overview** +Web-based application built in two phases. 
+- **Phase 1**: Text question → RAG retrieval → Point-form answer (strictly from database) +- **Phase 2**: Video upload + player → real-time audio streaming → ASR transcription → question extraction → Phase 1 RAG flow + +**Tech Stack** +- **Backend**: Python + FastAPI (REST + WebSocket) +- **Frontend**: TypeScript + React 18 (Vite) + shadcn/ui + Tailwind CSS +- **Server**: Linux Ubuntu 22.04 +- **RAG Database**: ChromaDB (persistent) +- **LLM/ASR Integration**: Dynamic via `.env` (supports local vLLM, OpenRouter, Alibaba Cloud) + - Alibaba Cloud reference: https://modelstudio.console.alibabacloud.com/ap-southeast-1?switchAgent=101503&tab=doc&productCode=p_efm&switchUserType=3#/doc/?type=model&url=2989727 + +- **Models**: + - Embedding: `qwen/qwen3-embedding-4b` + - LLM: `qwen/qwen3.5-35b-a3b` + - ASR: `Qwen/Qwen3-ASR-1.7B` + +**Deployment** +- Development: Simple commands (`uvicorn` + `npm run dev`) +- Production: Docker + Nginx + +--- + +## Project Structure (Monorepo) +app/ +├── backend/ # FastAPI +│ ├── app/ +│ │ ├── main.py +│ │ ├── routers/ # query.py, ingest.py, video.py, ws_asr.py +│ │ ├── services/ # rag.py, llm_client.py, asr_client.py, video_service.py +│ │ ├── models/ # Pydantic schemas +│ │ ├── core/ # config.py, database.py +│ │ └── utils/ # chunking, metadata extraction +│ ├── uploads/ # video storage (max 300MB) +│ ├── requirements.txt +│ └── .env.example +├── frontend/ # React + TypeScript (Vite) +│ ├── src/ +│ │ ├── components/ +│ │ ├── pages/ +│ │ ├── lib/ # api.ts +│ │ └── App.tsx +│ ├── package.json +│ └── vite.config.ts +├── chroma_db/ # Persistent vector store +├── Dockerfile +├── docker-compose.yml +├── nginx.conf +└── deploy.sh + + +--- + +## Key Requirements Incorporated + +- **LLM/ASR Configuration**: Backend reads from `.env` for easy switching between development (OpenRouter / Alibaba Cloud) and production (local vLLM). +- **RAG Database**: ChromaDB with metadata support (filename + extracted content metadata). 
+- **Embedding Model**: `qwen/qwen3-embedding-4b` via sentence-transformers. +- **Document Ingestion**: Via UI (project-based demo, no user authentication). +- **Video**: MP4 and common formats, maximum 300MB. +- **ASR Flow**: Both **automatic** (on transcript updates) and **manual** “Ask from Video” button. +- **UI Layout**: + - Top-Left: Video player + - Top-Right: Real-time transcript + text input box + - Bottom Half: RAG response (bullet points with source metadata) +- **Authentication**: Public demo (no login required). +- **Mobile**: Not required at this stage. + +--- + +## Phase 1: Text Question → RAG → Point-Form Answer (5-7 days) + +### Backend (FastAPI) +- Dynamic configuration via `.env` (LLM base URL, API key, model names). +- `services/rag.py`: Persistent ChromaDB + Qwen embedding + metadata extraction (filename, upload date, content summary). +- `services/llm_client.py`: OpenAI-compatible client for Qwen LLM with **strict RAG prompt** (only use retrieved context). +- Endpoints: + - `POST /api/v1/ingest` – Document upload and ingestion with metadata. + - `POST /api/v1/query` – Question → retrieve → LLM → bullet-point response. + +### Frontend (React + TS) +- Clean layout: Top-right input box, bottom response area. +- Type-safe API calls using TanStack Query. +- Display answer as clean bullet list with source metadata. + +--- + +## Phase 2: Video Upload + Real-Time ASR → RAG (8-10 days) + +### Backend Additions +- Video upload (`POST /api/v1/upload-video`) with size/format validation (<300MB). +- Static file serving for videos. +- WebSocket `/ws/asr/{video_id}` for real-time audio chunk streaming. +- ASR integration with `Qwen/Qwen3-ASR-1.7B` (file upload or audio content). +- Question extraction via LLM, then trigger Phase 1 RAG (auto + manual support). + +### Frontend Additions +- Drag & drop video upload + progress. +- Video player (`