diff --git a/.plans/phase1_enhancement_plan.md b/.plans/phase1_enhancement_plan.md index 93a4ea5..f6818a6 100644 --- a/.plans/phase1_enhancement_plan.md +++ b/.plans/phase1_enhancement_plan.md @@ -2,7 +2,7 @@ **Source**: User request (2026-04-23) **Scope**: Frontend navigation + RAG Database management page + page-aware chunking with chunk PDFs -**Status**: 🔄 In Progress — Features 1-2 ✅ Complete, Feature 3 pending +**Status**: ✅ Complete — All 3 features implemented (1.5.1–1.5.6) --- @@ -35,8 +35,8 @@ Enhance the existing Phase 1 application with three features: - ~~No routing or multi-page support~~ ✅ Done in Feature 1 - ~~No way to view what's stored in ChromaDB~~ ✅ Backend CRUD done (sub-phase 1.5.2) - ~~No way to delete documents or chunks~~ ✅ Backend CRUD done (sub-phase 1.5.2) -- No page-level awareness in chunking (all pages flattened before token splitting) -- No persistent chunk files (chunks only exist as ChromaDB document text) +- ~~No page-level awareness in chunking (all pages flattened before token splitting)~~ ✅ Done in 1.5.4 +- ~~No persistent chunk files (chunks only exist as ChromaDB document text)~~ ✅ Done in 1.5.5 - No clickable links in RAG responses to view source chunks - ~~Upload only via IngestPanel on the main query page~~ (IngestPanel stays on LTT, upload also coming to RAG DB page) @@ -211,7 +211,7 @@ class DeleteResponse(BaseModel): - [x] User can delete individual chunks (with confirmation) - [x] User can upload documents from this page - [x] Stats displayed: total documents, total chunks -- [ ] Uploading a file with existing filename triggers automatic replacement (old data deleted first) +- [x] Uploading a file with existing filename triggers automatic replacement (old data deleted first) --- @@ -493,19 +493,19 @@ legco_reranker/ ### 3.5 Acceptance Criteria -- [ ] PDF uploads produce page-aware chunks: 1 chunk per page with 200-token overlap from adjacent pages -- [ ] Each page is saved as a separate PDF (original page, not generated text) in `document_chunk/` -- [ ] Chunk PDF filename follows convention: `{filename}_page_{n}.pdf` -- [ ] Page numbers are sequential index (1, 2, 3...), not PDF internal labels -- [ ] Oversized pages are kept as single chunks (never split) -- [ ] `GET /api/v1/chunks/{file_path}/pdf` serves the original chunk PDF -- [ ] RAG response sources include `page_number` and `chunk_file_path` -- [ ] Frontend source cards show page number and clickable link -- [ ] Clicking source link opens/downloads the original chunk PDF -- [ ] DOCX uploads work without page numbers (graceful degradation, no chunk PDFs) -- [ ] Uploading a file with same filename replaces existing document (old chunks + PDFs deleted, new document_id) -- [ ] `document_chunk/` is `.gitignore`d -- [ ] Deleting a document also removes its chunk PDFs from `document_chunk/` +- [x] PDF uploads produce page-aware chunks: 1 chunk per page with 200-token overlap from adjacent pages +- [x] Each page is saved as a separate PDF (original page, not generated text) in `document_chunk/` +- [x] Chunk PDF filename follows convention: `{filename}_page_{n}.pdf` +- [x] Page numbers are sequential index (1, 2, 3...), not PDF internal labels +- [x] Oversized pages are kept as single chunks (never split) +- [x] `GET /api/v1/chunks/{file_path}/pdf` serves the original chunk PDF +- [x] RAG response sources include `page_number` and `chunk_file_path` +- [x] Frontend source cards show page number and clickable link +- [x] Clicking source link opens/downloads the original chunk PDF +- [x] DOCX uploads work without page numbers (graceful degradation, no chunk PDFs) +- [x] Uploading a file with same filename replaces existing document (old chunks + PDFs deleted, new document_id) +- [x] `document_chunk/` is `.gitignore`d +- [x] Deleting a document also removes its chunk PDFs from `document_chunk/` --- @@ -534,7 +534,7 @@ Feature 3 (Page-Aware Chunking) ← Modifies ingestion pipeline | 1.5.3 | 2 | Frontend RAG Database page | None | RAGDatabasePage, DocumentList, ChunkList, DocumentUpload, API hooks | ✅ Complete | | 1.5.4 | 3 | Page-aware parsing & chunking | pdf_parser, chunking, metadata enhancements | None | ✅ Complete | | 1.5.5 | 3 | Chunk PDF generation & storage | pdf_extractor, config, ingest pipeline refactor | None | ✅ Complete | -| 1.5.6 | 3 | Chunk file serving + frontend links | documents router endpoint | ResponsePanel clickable links, ChunkList updates | 📋 Pending | +| 1.5.6 | 3 | Chunk file serving + frontend links | documents router endpoint | ResponsePanel clickable links, ChunkList updates | ✅ Complete | ### Parallelization Opportunities @@ -619,7 +619,7 @@ None — all resolved. | `test_phase1_pdf_parser_pages.py` | parse_pdf_by_page() — multi-page PDFs, single-page, empty | | `test_phase1_page_aware_chunking.py` | chunk_with_pages() — cross-page chunks, single-page chunks | | `test_phase1_pdf_extractor.py` | extract_page_as_pdf() — valid page, out-of-range, corrupt PDF | -| `test_phase1_chunk_serving.py` | GET /chunks/{path}/pdf — valid file, missing file, path traversal | +| `test_phase1_chunk_serving.py` | ✅ GET /chunks/{path}/pdf — valid file, missing file, path traversal, filenames with spaces (5 tests, all pass) | ### Frontend Tests (New Files) @@ -628,7 +628,7 @@ None — all resolved. | `NavBar.test.tsx` | Navigation links, active state | | `RAGDatabasePage.test.tsx` | Document list, delete, upload | | `DocumentList.test.tsx` | Document cards, expand/collapse | -| `ChunkList.test.tsx` | Chunk table, page numbers, PDF links | +| `ChunkList.test.tsx` | ✅ Chunk table, page numbers, PDF links (7 tests, all pass) | ### Acceptance Tests diff --git a/backend/app/models/common.py b/backend/app/models/common.py index f5500c7..da370ef 100644 --- a/backend/app/models/common.py +++ b/backend/app/models/common.py @@ -1,3 +1,5 @@ +from typing import Optional + from pydantic import BaseModel @@ -6,3 +8,5 @@ class SourceMetadata(BaseModel): upload_date: str content_summary: str chunk_index: int + page_number: Optional[int] = None + chunk_file_path: Optional[str] = None diff --git a/backend/app/routers/documents.py b/backend/app/routers/documents.py index ba16725..b79f085 100644 --- a/backend/app/routers/documents.py +++ b/backend/app/routers/documents.py @@ -3,6 +3,7 @@ import logging import os from fastapi import APIRouter, HTTPException +from fastapi.responses import FileResponse from app.models.documents import ( DocumentInfo, @@ -126,3 +127,30 @@ async def delete_chunk(chunk_id: str): deleted=True, message=f"Deleted chunk {chunk_id}", ) + + +@router.get("/chunks/{file_path:path}/pdf") +async def get_chunk_pdf(file_path: str): + """Serve a chunk PDF file from document_chunk/ directory.""" + from app.core.config import get_settings + + # Path traversal protection + if ".." in file_path: + raise HTTPException(status_code=400, detail="Invalid file path") + + settings = get_settings() + base_dir = os.path.realpath(settings.document_chunk_path) + full_path = os.path.realpath(os.path.join(base_dir, file_path)) + + # Ensure resolved path is within base directory + if not full_path.startswith(base_dir + os.sep) and full_path != base_dir: + raise HTTPException(status_code=400, detail="Invalid file path") + + if not os.path.isfile(full_path): + raise HTTPException(status_code=404, detail=f"Chunk file not found: {file_path}") + + return FileResponse( + full_path, + media_type="application/pdf", + filename=os.path.basename(full_path), + ) diff --git a/backend/app/routers/query.py b/backend/app/routers/query.py index ceecbe6..f1cd6ef 100644 --- a/backend/app/routers/query.py +++ b/backend/app/routers/query.py @@ -59,6 +59,8 @@ async def query(request: QueryRequest): upload_date=meta.get("upload_date", ""), content_summary=meta.get("content_summary", ""), chunk_index=meta.get("chunk_index", 0), + page_number=meta.get("page_number"), + chunk_file_path=meta.get("chunk_file_path"), ) for meta in chunk_metadata ] diff --git a/backend/app/test/test_phase1_chunk_serving.py b/backend/app/test/test_phase1_chunk_serving.py new file mode 100644 index 0000000..782006f --- /dev/null +++ b/backend/app/test/test_phase1_chunk_serving.py @@ -0,0 +1,73 @@ +"""Tests for chunk PDF file serving endpoint. + +Coverage: +- GET /api/v1/chunks/{file_path}/pdf — success, 404, path traversal 400 +""" +import os +import tempfile +import unittest +from unittest.mock import patch + +from fastapi.testclient import TestClient + +from app.main import app + + +class TestChunkServing(unittest.TestCase): + """Test GET /api/v1/chunks/{file_path}/pdf endpoint.""" + + def setUp(self): + self.client = TestClient(app) + + def test_get_chunk_pdf_success(self): + """Should serve chunk PDF file with 200 and application/pdf.""" + with tempfile.TemporaryDirectory() as tmp_dir: + test_file = os.path.join(tmp_dir, "test_page_1.pdf") + with open(test_file, "wb") as f: + f.write(b"%PDF-1.4 fake content") + + with patch("app.core.config.get_settings") as mock_settings: + mock_settings.return_value.document_chunk_path = tmp_dir + response = self.client.get("/api/v1/chunks/test_page_1.pdf/pdf") + + self.assertEqual(response.status_code, 200) + self.assertIn("application/pdf", response.headers["content-type"]) + + def test_get_chunk_pdf_not_found(self): + """Should return 404 for non-existent chunk file.""" + with patch("app.core.config.get_settings") as mock_settings: + mock_settings.return_value.document_chunk_path = "/tmp/nonexistent_chunk_dir" + response = self.client.get("/api/v1/chunks/nonexistent.pdf/pdf") + + self.assertEqual(response.status_code, 404) + + def test_get_chunk_pdf_path_traversal_double_dot(self): + """Should reject path traversal with .. (404 due to Starlette normalization).""" + with patch("app.core.config.get_settings") as mock_settings: + mock_settings.return_value.document_chunk_path = "/tmp/fake_chunk_dir" + response = self.client.get("/api/v1/chunks/../etc/passwd/pdf") + + self.assertIn(response.status_code, [400, 404]) + + def test_get_chunk_pdf_path_traversal_symlink_escape(self): + """Should reject resolved path escaping base directory (404 from normalization).""" + with tempfile.TemporaryDirectory() as tmp_dir: + with patch("app.core.config.get_settings") as mock_settings: + mock_settings.return_value.document_chunk_path = tmp_dir + response = self.client.get("/api/v1/chunks/../../etc/passwd/pdf") + + self.assertIn(response.status_code, [400, 404]) + + def test_get_chunk_pdf_with_spaces_in_filename(self): + """Should serve files with spaces in the filename.""" + with tempfile.TemporaryDirectory() as tmp_dir: + test_file = os.path.join(tmp_dir, "NEC4 ACC_page_3.pdf") + with open(test_file, "wb") as f: + f.write(b"%PDF-1.4 fake content") + + with patch("app.core.config.get_settings") as mock_settings: + mock_settings.return_value.document_chunk_path = tmp_dir + response = self.client.get("/api/v1/chunks/NEC4 ACC_page_3.pdf/pdf") + + self.assertEqual(response.status_code, 200) + self.assertIn("application/pdf", response.headers["content-type"]) diff --git a/frontend/src/components/ChunkList.tsx b/frontend/src/components/ChunkList.tsx index 2951b1f..94d634d 100644 --- a/frontend/src/components/ChunkList.tsx +++ b/frontend/src/components/ChunkList.tsx @@ -1,6 +1,7 @@ import React from 'react' import { Trash2 } from 'lucide-react' import type { ChunkInfo } from '../types' +import { getChunkPdfUrl } from '../lib/api' interface ChunkListProps { chunks: ChunkInfo[] @@ -64,6 +65,17 @@ export const ChunkList: React.FC = ({ ? `${chunk.content_summary.slice(0, 100)}...` : chunk.content_summary} + {chunk.chunk_file_path && ( + + View PDF + + )}