diff --git a/README.md b/README.md index 3bd91d9..21f5078 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # LegCo Reranker -RAG-powered document Q&A app. Upload PDFs, ask questions in Cantonese, get bullet-point answers with citations. +RAG-powered document Q&A app with video ASR. Upload PDFs, upload videos with Cantonese ASR transcription, ask questions, get bullet-point answers with citations. ## Quick Start (Dev) ```bash # Backend cd backend -cp .env.example .env # edit .env with your LLM API key +cp .env.example .env # edit .env with your LLM API key AND DashScope API key (for video ASR) pip install -r requirements.txt uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload @@ -73,6 +73,12 @@ All configurable via `backend/.env`: | `PROMPTS_DB_PATH` | `./data/prompts.db` | Prompt templates SQLite | | `HISTORY_DB_PATH` | `./data/history.db` | Query history SQLite | | `CORS_ORIGINS` | `["http://localhost:5173","http://localhost:3000"]` | Allowed CORS origins | +| `DASHSCOPE_API_KEY` | — | Alibaba Cloud DashScope API key (for video ASR) | +| `ASR_MODEL_NAME` | `qwen3-asr-flash` | ASR model for batch transcription | +| `ASR_REALTIME_MODEL_NAME` | `qwen3-asr-flash-realtime` | ASR model for real-time streaming | +| `VIDEO_UPLOAD_DIR` | `./uploads` | Video file storage directory | +| `MAX_VIDEO_SIZE_MB` | `300` | Maximum video upload size | +| `SUPPORTED_VIDEO_FORMATS` | `.mp4, .webm, .mov, .avi, .mkv` | Allowed video file extensions | ### Production: Nginx Reverse Proxy @@ -210,9 +216,32 @@ User Question → SSE stream with per-sub-question sources ``` +### Video Q&A (Phase 2) + +``` +Video → Audio → DashScope ASR → Transcript → QueryInput → RAG Pipeline +``` + +**Streaming Mode (real-time):** +- Upload video → press play → transcript flows into QueryInput in real time +- Audio captured from video element (no microphone needed) +- Auto-starts on play, stops on pause/end + +**Full Transcript Mode (batch):** +- Click "Full Transcript" button under video player +- Server extracts audio via ffmpeg → Full DashScope transcription +- Complete transcript fills QueryInput + +**Requirements:** +- `DASHSCOPE_API_KEY` in `.env` +- `ffmpeg` on server (for batch transcription) + ## Notes - PDF upload limit: 300MB +- Video upload limit: 300MB (same as PDF) +- ffmpeg required on server (for video transcription) +- DashScope ASR supports Cantonese (yue), Mandarin (zh), English (en), auto-detect - Desktop only (not mobile-optimized) - No authentication (public demo) - All LLM calls routed through configurable base URL diff --git a/backend/app/routers/video.py b/backend/app/routers/video.py index 0d91a6d..e2ce71e 100644 --- a/backend/app/routers/video.py +++ b/backend/app/routers/video.py @@ -1,4 +1,5 @@ import logging +import time import uuid from pathlib import Path @@ -30,6 +31,9 @@ async def upload_video(file: UploadFile = File(...)): service = _get_video_service() filename = file.filename or "unknown" ext = Path(filename).suffix.lower() + upload_start = time.monotonic() + + logger.info("upload-started filename=%s content_type=%s", filename, file.content_type) total_size = 0 video_id = uuid.uuid4().hex[:12] @@ -53,7 +57,14 @@ async def upload_video(file: UploadFile = File(...)): raise HTTPException(status_code=500, detail="Upload failed") service.validate_video(filename, file.content_type, total_size) - logger.info("Video uploaded: id=%s filename=%s size=%d", video_id, filename, total_size) + upload_duration = time.monotonic() - upload_start + logger.info( + "upload-completed video_id=%s filename=%s size=%d duration=%.2fs", + video_id, + filename, + total_size, + upload_duration, + ) return VideoUploadResponse( video_id=video_id, @@ -89,20 +100,32 @@ async def transcribe_video(video_id: str, language: str = "yue"): detail="DASHSCOPE_API_KEY is not configured. Set it in .env to enable transcription.", ) + transcribe_start = time.monotonic() + logger.info("transcribe-started video_id=%s language=%s", video_id, language) + service = _get_video_service() wav_path = await service.extract_audio(video_id) try: audio_bytes = wav_path.read_bytes() + logger.debug("audio-extracted video_id=%s wav_size=%d", video_id, len(audio_bytes)) asr = ASRClient(settings) text = asr.transcribe_full(audio_bytes, language=language) except Exception as e: - logger.error("Transcription failed for video_id=%s: %s", video_id, e) + logger.error("transcribe-failed video_id=%s error=%s", video_id, e) raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}") finally: if wav_path.exists(): wav_path.unlink(missing_ok=True) + transcribe_duration = time.monotonic() - transcribe_start + logger.info( + "transcribe-completed video_id=%s text_len=%d duration=%.2fs", + video_id, + len(text), + transcribe_duration, + ) + return FullTranscriptResponse( text=text, language=language, diff --git a/backend/app/routers/ws_asr.py b/backend/app/routers/ws_asr.py index 1a5d7c3..5b5497e 100644 --- a/backend/app/routers/ws_asr.py +++ b/backend/app/routers/ws_asr.py @@ -2,6 +2,7 @@ import json import asyncio import base64 import logging +import time from fastapi import APIRouter, WebSocket, WebSocketDisconnect @@ -32,17 +33,19 @@ class DashScopeCallback(OmniRealtimeCallback): self._loop = loop def on_open(self): - logger.info("DashScope realtime connection opened") + logger.info("dashscope-connection-opened") def on_event(self, message): try: event = json.loads(message) if isinstance(message, str) else message + event_type = event.get("type", "") if isinstance(event, dict) else "" + logger.debug("dashscope-event-received type=%s", event_type) self._loop.call_soon_threadsafe(self._queue.put_nowait, event) except Exception as e: - logger.error("DashScope callback error: %s", e) + logger.error("dashscope-callback-error error=%s", e) def on_close(self, code, msg): - logger.info("DashScope realtime closed: code=%s msg=%s", code, msg) + logger.info("dashscope-connection-closed code=%s msg=%s", code, msg) def format_transcription_event(event: dict, accumulated: str) -> dict | None: @@ -74,9 +77,11 @@ def format_transcription_event(event: dict, accumulated: str) -> dict | None: async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventLoop, language: str = "yue"): event_queue: asyncio.Queue = asyncio.Queue() callback = DashScopeCallback(event_queue, loop) + session_start = time.monotonic() conversation = OmniRealtimeConversation( model=get_settings().asr_realtime_model_name, + api_key=get_settings().dashscope_api_key, url="wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime", callback=callback, ) @@ -99,6 +104,7 @@ async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventL logger.info("dashscope-session-updated lang=%s", language) accumulated_text = "" + chunk_count = 0 async def read_events(): nonlocal accumulated_text @@ -112,6 +118,11 @@ async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventL transcript = event.get("transcript", "") if transcript and transcript.strip(): accumulated_text = build_display_text(accumulated_text, transcript) + logger.info( + "dashscope-utterance-completed text_len=%d lang=%s", + len(accumulated_text), + result.get("language", "yue"), + ) result["full_text"] = _to_traditional(accumulated_text) await client_ws.send_json(result) @@ -123,34 +134,53 @@ async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventL s16_bytes = float32_to_s16le(float32_bytes) audio_b64 = base64.b64encode(s16_bytes).decode("ascii") conversation.append_audio(audio_b64) + chunk_count += 1 + logger.debug( + "audio-chunk-received size_bytes=%d sample_count=%d chunk_num=%d", + len(float32_bytes), + len(float32_bytes) // 4, + chunk_count, + ) except WebSocketDisconnect: - pass + logger.warning( + "client-disconnected-mid-session chunks=%d accumulated_len=%d", + chunk_count, + len(accumulated_text), + ) finally: read_task.cancel() try: conversation.close() except Exception: pass - logger.info("dashscope-session-closed text_len=%d", len(accumulated_text)) + duration = time.monotonic() - session_start + logger.info( + "dashscope-session-closed text_len=%d chunks=%d duration=%.1fs", + len(accumulated_text), + chunk_count, + duration, + ) @router.websocket("/ws/asr/{video_id}") async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "yue"): settings = get_settings() + client_host = websocket.client.host if websocket.client else "unknown" if not settings.dashscope_api_key: await websocket.accept() await websocket.send_json({"error": "DASHSCOPE_API_KEY is not configured"}) await websocket.close(code=1011, reason="DASHSCOPE_API_KEY not set") + logger.warning("ws-rejected-no-apikey video_id=%s client=%s", video_id, client_host) return await websocket.accept() loop = asyncio.get_event_loop() - logger.info("ws-connect video_id=%s lang=%s", video_id, language) + logger.info("ws-connect video_id=%s lang=%s client=%s", video_id, language, client_host) try: await _ws_proxy_dashscope(websocket, loop, language) except Exception as e: - logger.error("ws-asr error: %s", e) + logger.error("ws-asr-error video_id=%s error=%s", video_id, e) finally: logger.info("ws-disconnect video_id=%s", video_id) diff --git a/backend/app/services/asr_client.py b/backend/app/services/asr_client.py index af2863a..e457d28 100644 --- a/backend/app/services/asr_client.py +++ b/backend/app/services/asr_client.py @@ -32,13 +32,17 @@ class ASRClient: def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str: audio_b64 = base64.b64encode(audio_bytes).decode() - data_url = f"data:;base64,{audio_b64}" + data_url = f"data:audio/wav;base64,{audio_b64}" client = OpenAI( api_key=self.settings.dashscope_api_key, base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1", ) + asr_options: dict = {} + if language != "auto": + asr_options["language"] = language + resp = client.chat.completions.create( model=self.settings.asr_model_name, messages=[{ # type: ignore[list-item] @@ -48,11 +52,7 @@ class ASRClient: "input_audio": {"data": data_url}, }], }], - extra_body={ - "asr_options": { - "language": language if language != "auto" else None, - } - }, + extra_body={"asr_options": asr_options} if asr_options else None, ) result = resp.choices[0].message.content or "" diff --git a/backend/app/test/acceptance/test_acceptance_phase2_asr.py b/backend/app/test/acceptance/test_acceptance_phase2_asr.py index ed3d480..7762819 100644 --- a/backend/app/test/acceptance/test_acceptance_phase2_asr.py +++ b/backend/app/test/acceptance/test_acceptance_phase2_asr.py @@ -1,23 +1,70 @@ -"""Acceptance test: Phase 2 ASR transcription with real Qwen ASR model. +"""Acceptance test: Phase 2 real-time ASR via DashScope WebSocket. Prerequisites: -- ASR model endpoint accessible (local vLLM or cloud) -- Test audio file available -- WebSocket server running +- DashScope API key configured in .env (DASHSCOPE_API_KEY) +- Backend running at configured host/port +- dashscope Python package installed """ +import os + import pytest @pytest.mark.acceptance @pytest.mark.slow -def test_asr_transcribe_real_audio(): - """Should return accurate transcript from real ASR model.""" - pass # TODO: implement with real ASR API call +def test_websocket_asr_streaming(): + """Connect WebSocket, send audio, receive transcript events.""" + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + pytest.skip("DASHSCOPE_API_KEY not configured") + + import websocket + import struct + import time + + base_url = os.getenv("BACKEND_URL", "http://localhost:8000") + ws_url = base_url.replace("http", "ws") + "/ws/asr/test-video-acceptance?language=yue" + + ws = websocket.create_connection(ws_url, timeout=30) + try: + float32_samples = struct.pack(f"<{16000}f", *[0.0] * 16000) + ws.send_binary(float32_samples) + time.sleep(2) + finally: + ws.close() @pytest.mark.acceptance @pytest.mark.slow -@pytest.mark.asyncio -async def test_ws_asr_streaming(): - """Should stream audio chunks and receive transcripts via WebSocket.""" - pass # TODO: implement with real WebSocket connection +def test_full_transcript_batch(): + """POST transcribe endpoint with real audio from uploaded video.""" + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + pytest.skip("DASHSCOPE_API_KEY not configured") + + test_file = os.getenv("VIDEO_TEST_FILE") + if not test_file or not os.path.exists(test_file): + pytest.skip("VIDEO_TEST_FILE not configured or file not found") + + import requests + + base_url = os.getenv("BACKEND_URL", "http://localhost:8000") + + with open(test_file, "rb") as f: + resp = requests.post( + f"{base_url}/api/v1/video/upload", + files={"file": (os.path.basename(test_file), f, "video/mp4")}, + timeout=60, + ) + assert resp.status_code == 200 + video_id = resp.json()["video_id"] + + resp = requests.post( + f"{base_url}/api/v1/video/{video_id}/transcribe?language=yue", + timeout=300, + ) + assert resp.status_code == 200 + data = resp.json() + assert "text" in data + assert data["language"] == "yue" + assert len(data["text"]) > 0 diff --git a/backend/app/test/acceptance/test_acceptance_phase2_video.py b/backend/app/test/acceptance/test_acceptance_phase2_video.py index 3c722c8..a520cd3 100644 --- a/backend/app/test/acceptance/test_acceptance_phase2_video.py +++ b/backend/app/test/acceptance/test_acceptance_phase2_video.py @@ -1,22 +1,74 @@ -"""Acceptance test: Phase 2 video upload with real file storage. +"""Acceptance test: Phase 2 video upload + Full Transcript with real DashScope. Prerequisites: -- Backend server running (uvicorn) -- uploads/ directory writable -- Test video file < 300MB available +- DashScope API key configured in .env (DASHSCOPE_API_KEY) +- ffmpeg installed on system +- Real video file available at path specified in VIDEO_TEST_FILE env var (optional) +- Backend server running at configured host/port """ +import os + import pytest +@pytest.mark.acceptance +@pytest.mark.slow +def test_video_upload_and_transcribe_with_real_asr(): + """Upload a real video and get full transcript via DashScope.""" + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + pytest.skip("DASHSCOPE_API_KEY not configured") + + test_file = os.getenv("VIDEO_TEST_FILE") + if not test_file or not os.path.exists(test_file): + pytest.skip("VIDEO_TEST_FILE not configured or file not found") + + import requests + + base_url = os.getenv("BACKEND_URL", "http://localhost:8000") + + with open(test_file, "rb") as f: + resp = requests.post( + f"{base_url}/api/v1/video/upload", + files={"file": (os.path.basename(test_file), f, "video/mp4")}, + timeout=60, + ) + assert resp.status_code == 200, f"Upload failed: {resp.text}" + video_id = resp.json()["video_id"] + + resp = requests.post( + f"{base_url}/api/v1/video/{video_id}/transcribe", + timeout=300, + ) + assert resp.status_code == 200, f"Transcribe failed: {resp.text}" + data = resp.json() + assert "text" in data + assert len(data["text"]) > 0, "Transcript should not be empty" + + @pytest.mark.acceptance @pytest.mark.slow def test_upload_video_mp4(): """Should upload MP4 and return playable video URL.""" - pass # TODO: implement with real HTTP POST to /api/v1/upload-video + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + pytest.skip("DASHSCOPE_API_KEY not configured") + import requests -@pytest.mark.acceptance -@pytest.mark.slow -def test_upload_size_rejection(): - """Should reject files over 300MB with proper error.""" - pass # TODO: implement with real file upload attempt + base_url = os.getenv("BACKEND_URL", "http://localhost:8000") + content = b"\x00" * 1024 + + resp = requests.post( + f"{base_url}/api/v1/video/upload", + files={"file": ("test_acceptance.mp4", content, "video/mp4")}, + timeout=30, + ) + assert resp.status_code == 200 + data = resp.json() + assert "video_id" in data + assert data["url"].startswith("/api/v1/video/") + + resp = requests.get(f"{base_url}{data['url']}", timeout=30) + assert resp.status_code == 200 + assert resp.headers["content-type"] == "video/mp4" diff --git a/backend/app/test/acceptance/test_integration_phase2.py b/backend/app/test/acceptance/test_integration_phase2.py index b6264a3..3f3e96a 100644 --- a/backend/app/test/acceptance/test_integration_phase2.py +++ b/backend/app/test/acceptance/test_integration_phase2.py @@ -1,16 +1,57 @@ -"""Acceptance test: End-to-end Phase 2 — video → ASR → RAG → answer. +"""Acceptance test: Phase 2 end-to-end video → ASR → query flow. Prerequisites: -- Full backend running (uvicorn) -- ChromaDB initialized with test documents -- LLM and ASR providers configured and accessible -- Test video file with known audio content +- All Phase 2 services running (uvicorn) +- DashScope API key configured in .env (DASHSCOPE_API_KEY) +- ChromaDB with test documents ingested +- Test video file with known Cantonese audio content +- LLM provider configured and accessible """ +import os + import pytest @pytest.mark.acceptance @pytest.mark.slow -def test_e2e_phase2_video_query(): - """Should upload video, transcribe, and answer from transcript.""" - pass # TODO: implement full flow: upload → ASR → query → verify answer +def test_e2e_video_transcript_to_rag(): + """Upload video → full transcript → RAG query → answer with citations.""" + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + pytest.skip("DASHSCOPE_API_KEY not configured") + + llm_key = os.getenv("LLM_API_KEY") + if not llm_key: + pytest.skip("LLM_API_KEY not configured") + + test_file = os.getenv("VIDEO_TEST_FILE") + if not test_file or not os.path.exists(test_file): + pytest.skip("VIDEO_TEST_FILE not configured or file not found") + + import requests + + base_url = os.getenv("BACKEND_URL", "http://localhost:8000") + + with open(test_file, "rb") as f: + resp = requests.post( + f"{base_url}/api/v1/video/upload", + files={"file": (os.path.basename(test_file), f, "video/mp4")}, + timeout=60, + ) + assert resp.status_code == 200, f"Upload failed: {resp.text}" + video_id = resp.json()["video_id"] + + resp = requests.post( + f"{base_url}/api/v1/video/{video_id}/transcribe", + timeout=300, + ) + assert resp.status_code == 200, f"Transcribe failed: {resp.text}" + transcript = resp.json()["text"] + assert len(transcript) > 0, "Transcript should not be empty" + + resp = requests.post( + f"{base_url}/api/v1/query", + json={"question": transcript[:200]}, + timeout=120, + ) + assert resp.status_code == 200 diff --git a/backend/app/test/test_integration_phase2.py b/backend/app/test/test_integration_phase2.py new file mode 100644 index 0000000..dab6714 --- /dev/null +++ b/backend/app/test/test_integration_phase2.py @@ -0,0 +1,215 @@ +"""Integration test: Phase 2 end-to-end video upload, serve, transcribe, delete. + +Covers: +- Full upload → transcribe flow (mocked ASR, real file I/O) +- Transcribe with missing video returns 404 +- Transcribe with ffmpeg failure returns 500 +- Serve uploaded video returns correct content-type +- Upload → serve → delete flow + +All external APIs (DashScope, ffmpeg) are mocked. Real FastAPI TestClient +with real file I/O via tmp_path. +""" +import time +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from app.routers.video import router + + +@pytest.fixture +def video_client(tmp_path, monkeypatch): + upload_dir = tmp_path / "test_uploads" + upload_dir.mkdir() + monkeypatch.setenv("VIDEO_UPLOAD_DIR", str(upload_dir)) + monkeypatch.setenv("MAX_VIDEO_SIZE_MB", "50") + monkeypatch.setenv("DASHSCOPE_API_KEY", "sk-test-key") + + from app.core.config import get_settings + get_settings.cache_clear() + app = FastAPI() + app.include_router(router, prefix="/api/v1") + return TestClient(app), upload_dir + + +def _upload_video(client, filename="test.mp4", content=b"\x00" * 1024): + """Helper: upload a video, return video_id.""" + resp = client.post( + "/api/v1/video/upload", + files={"file": (filename, content, "video/mp4")}, + ) + assert resp.status_code == 200 + return resp.json()["video_id"] + + +class TestUploadTranscribeFlow: + """Full upload → transcribe with mocked ASR and real file I/O.""" + + @patch("app.services.asr_client.OpenAI") + @patch("app.services.video_service.asyncio.create_subprocess_exec") + def test_upload_then_transcribe(self, mock_subprocess, mock_openai_cls, video_client): + """Upload video → extract audio (mocked ffmpeg) → transcribe (mocked ASR) → verify response.""" + client, upload_dir = video_client + + # 1. Upload video + content = b"\x00" * 2048 + video_id = _upload_video(client, content=content) + + # 2. Mock ffmpeg subprocess to produce a fake WAV file + async def fake_ffmpeg(*args, **kwargs): + # Write a fake WAV so the transcribe endpoint can read it + output_path = upload_dir / f"{video_id}_audio.wav" + output_path.write_bytes(b"RIFF" + b"\x00" * 100) + + proc = AsyncMock() + proc.returncode = 0 + proc.communicate = AsyncMock(return_value=(b"ffmpeg output", b"")) + return proc + + mock_subprocess.side_effect = fake_ffmpeg + + # 3. Mock ASR client (OpenAI-compatible DashScope call) + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + mock_resp.choices[0].message.content = "呢個係測試轉錄結果" + + mock_openai_instance = MagicMock() + mock_openai_instance.chat.completions.create.return_value = mock_resp + mock_openai_cls.return_value = mock_openai_instance + + # 4. Call transcribe + resp = client.post(f"/api/v1/video/{video_id}/transcribe") + assert resp.status_code == 200 + data = resp.json() + assert "text" in data + assert data["language"] == "yue" + assert len(data["text"]) > 0 + + # 5. Verify temp WAV was cleaned up + wav_path = upload_dir / f"{video_id}_audio.wav" + assert not wav_path.exists(), "Temp WAV file should be cleaned up after transcription" + + @patch("app.services.asr_client.OpenAI") + @patch("app.services.video_service.asyncio.create_subprocess_exec") + def test_upload_transcribe_custom_language(self, mock_subprocess, mock_openai_cls, video_client): + """Transcribe with language=en should pass it through.""" + client, upload_dir = video_client + video_id = _upload_video(client) + + async def fake_ffmpeg(*args, **kwargs): + output_path = upload_dir / f"{video_id}_audio.wav" + output_path.write_bytes(b"RIFF" + b"\x00" * 50) + proc = AsyncMock() + proc.returncode = 0 + proc.communicate = AsyncMock(return_value=(b"", b"")) + return proc + + mock_subprocess.side_effect = fake_ffmpeg + + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + mock_resp.choices[0].message.content = "Hello world transcript" + + mock_openai_instance = MagicMock() + mock_openai_instance.chat.completions.create.return_value = mock_resp + mock_openai_cls.return_value = mock_openai_instance + + resp = client.post(f"/api/v1/video/{video_id}/transcribe?language=en") + assert resp.status_code == 200 + assert resp.json()["language"] == "en" + assert "Hello" in resp.json()["text"] + + +class TestTranscribeMissingVideo: + """Transcribe on nonexistent video_id → 404.""" + + def test_transcribe_404_for_unknown_video(self, video_client): + client, _ = video_client + resp = client.post("/api/v1/video/nonexistent-video-id/transcribe") + assert resp.status_code == 404 + + +class TestTranscribeFFmpegFailure: + """Transcribe with ffmpeg failure → 500.""" + + @patch("app.services.video_service.asyncio.create_subprocess_exec") + def test_transcribe_ffmpeg_failure_returns_500(self, mock_subprocess, video_client): + """If ffmpeg exits non-zero, transcribe should return 500.""" + client, upload_dir = video_client + video_id = _upload_video(client) + + async def failing_ffmpeg(*args, **kwargs): + proc = AsyncMock() + proc.returncode = 1 + proc.communicate = AsyncMock(return_value=(b"", b"Error: Invalid data found")) + return proc + + mock_subprocess.side_effect = failing_ffmpeg + + resp = client.post(f"/api/v1/video/{video_id}/transcribe") + assert resp.status_code == 500 + assert "Audio extraction failed" in resp.json()["detail"] + + +class TestServeVideoContentType: + """Serve uploaded video returns correct content-type.""" + + def test_serve_mp4_content_type(self, video_client): + client, _ = video_client + content = b"\x00" * 512 + video_id = _upload_video(client, content=content) + + resp = client.get(f"/api/v1/video/{video_id}") + assert resp.status_code == 200 + assert resp.headers["content-type"] == "video/mp4" + assert resp.content == content + + def test_serve_webm_content_type(self, video_client): + client, _ = video_client + content = b"\x00" * 256 + resp = client.post( + "/api/v1/video/upload", + files={"file": ("test.webm", content, "video/webm")}, + ) + assert resp.status_code == 200 + video_id = resp.json()["video_id"] + + resp = client.get(f"/api/v1/video/{video_id}") + assert resp.status_code == 200 + assert resp.headers["content-type"] == "video/webm" + assert resp.content == content + + +class TestUploadServeDeleteFlow: + """Full lifecycle: upload → serve → delete → 404.""" + + def test_upload_serve_delete_lifecycle(self, video_client): + client, upload_dir = video_client + content = b"\x00" * 1024 + + # 1. Upload + video_id = _upload_video(client, content=content) + + # 2. Serve — verify exists and content matches + resp = client.get(f"/api/v1/video/{video_id}") + assert resp.status_code == 200 + assert resp.content == content + + # 3. Delete via VideoService directly + from app.core.config import get_settings + get_settings.cache_clear() + from app.services.video_service import VideoService + service = VideoService( + upload_dir=str(upload_dir), + max_size_mb=50, + supported_formats=[".mp4", ".webm", ".mov", ".avi", ".mkv"], + ) + service.delete_video(video_id) + + # 4. Verify 404 after deletion + resp = client.get(f"/api/v1/video/{video_id}") + assert resp.status_code == 404 diff --git a/backend/app/test/test_phase2_asr_client.py b/backend/app/test/test_phase2_asr_client.py index 418333e..ef8721b 100644 --- a/backend/app/test/test_phase2_asr_client.py +++ b/backend/app/test/test_phase2_asr_client.py @@ -191,4 +191,4 @@ class TestTranscribeFull: client.transcribe_full(b"audio", language="auto") call_kwargs = mock_openai_client.chat.completions.create.call_args - assert call_kwargs.kwargs["extra_body"]["asr_options"]["language"] is None + assert call_kwargs.kwargs.get("extra_body") is None diff --git a/frontend/src/components/QueryInput.tsx b/frontend/src/components/QueryInput.tsx index 20fdeba..82e2478 100644 --- a/frontend/src/components/QueryInput.tsx +++ b/frontend/src/components/QueryInput.tsx @@ -1,17 +1,25 @@ -import React, { useState, type FormEvent, type KeyboardEvent } from 'react' +import React, { useState, useEffect, type FormEvent, type KeyboardEvent } from 'react' export interface QueryInputProps { onSubmit: (question: string) => void isLoading: boolean partialText?: string + value?: string } -export const QueryInput: React.FC = ({ onSubmit, isLoading, partialText }) => { +export const QueryInput: React.FC = ({ onSubmit, isLoading, partialText, value }) => { const [question, setQuestion] = useState('') const [submittedQuestion, setSubmittedQuestion] = useState(null) const [hasUserInput, setHasUserInput] = useState(false) - const displayValue = hasUserInput ? question : (partialText ?? question) + useEffect(() => { + if (value !== undefined) { + setQuestion(value) + setHasUserInput(false) + } + }, [value]) + + const displayValue = hasUserInput ? question : (partialText || question) const showPartialStyle = !hasUserInput && !!partialText const handleSubmit = (e: FormEvent): void => { diff --git a/frontend/src/hooks/useFullTranscript.ts b/frontend/src/hooks/useFullTranscript.ts index f6c7e8d..f12c42a 100644 --- a/frontend/src/hooks/useFullTranscript.ts +++ b/frontend/src/hooks/useFullTranscript.ts @@ -13,7 +13,8 @@ export function useFullTranscript({ videoId }: UseFullTranscriptOptions) { setIsLoading(true) setError(null) try { - const resp = await fetch(`/api/v1/video/${videoId}/transcribe`, { + const base = import.meta.env.VITE_API_BASE_URL ?? '' + const resp = await fetch(`${base}/api/v1/video/${videoId}/transcribe`, { method: 'POST', }) if (!resp.ok) { diff --git a/frontend/src/hooks/useVideoASR.ts b/frontend/src/hooks/useVideoASR.ts index 3299c2a..8f187d0 100644 --- a/frontend/src/hooks/useVideoASR.ts +++ b/frontend/src/hooks/useVideoASR.ts @@ -29,7 +29,8 @@ export function useVideoASR({ const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' const host = window.location.host const langParam = language !== 'auto' ? `?language=${language}` : '' - return `${protocol}//${host}/ws/asr/${videoId}${langParam}` + const backendHost = import.meta.env.VITE_WS_HOST ?? host + return `${protocol}//${backendHost}/ws/asr/${videoId}${langParam}` }, [videoId, language]) const startStreaming = useCallback(() => { diff --git a/frontend/src/pages/LTTPage.tsx b/frontend/src/pages/LTTPage.tsx index b536444..392d6f5 100644 --- a/frontend/src/pages/LTTPage.tsx +++ b/frontend/src/pages/LTTPage.tsx @@ -1,4 +1,4 @@ -import React, { useState, useRef, useCallback, useEffect } from 'react' +import React, { useState, useCallback, useEffect } from 'react' import { Loader2, AlertCircle, FileText } from 'lucide-react' import { Group, Panel, Separator } from 'react-resizable-panels' import { useQueryDocumentStream } from '../lib/queries' @@ -14,13 +14,13 @@ import { VideoPlayer } from '../components/VideoPlayer' export const LTTPage: React.FC = () => { const [currentVideoId, setCurrentVideoId] = useState(null) const [queryText, setQueryText] = useState('') - const videoRef = useRef(null) + const [videoEl, setVideoEl] = useState(null) const queryStream = useQueryDocumentStream() const asr = useVideoASR({ videoId: currentVideoId ?? '', - videoElement: videoRef.current, + videoElement: videoEl, language: 'yue', onFinalTranscript: (text) => { setQueryText(text) @@ -41,6 +41,7 @@ export const LTTPage: React.FC = () => { const handleQuerySubmit = (question: string): void => { queryStream.mutate({ question }) + setQueryText('') } const handleRequestFullTranscript = useCallback(() => { @@ -66,7 +67,7 @@ export const LTTPage: React.FC = () => {
{currentVideoId ? ( <> - +