feat(backend): add documents CRUD service methods and Pydantic schemas

Add list_documents(), list_chunks(), delete_document(), delete_chunk() to RAGService for ChromaDB document management. New schemas: DocumentInfo, ChunkInfo, DocumentListResponse, DeleteResponse. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-23 19:02:07 +08:00 · 2026-04-23 19:02:07 +08:00 · 178461915a
parent 52c09b86cb
commit 178461915a
2 changed files with 100 additions and 0 deletions
--- a/backend/app/models/documents.py
+++ b/backend/app/models/documents.py
@ -0,0 +1,29 @@
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class DocumentInfo(BaseModel):
+    """Per-document summary record.
+
+    NOTE(review): the field names match the dicts built by
+    ``RAGService.list_documents()``; presumably this model validates those
+    entries -- confirm against the router that uses it.
+    """
+
+    # Identifier of the document.
+    document_id: str
+    # File name associated with the document.
+    filename: str
+    # Number of chunks belonging to the document.
+    chunk_count: int
+    # Upload date kept as a plain string; no date format is enforced here.
+    upload_date: str
+
+
+class ChunkInfo(BaseModel):
+    """Description of a single chunk.
+
+    NOTE(review): the field names match the dicts built by
+    ``RAGService.list_chunks()``; presumably this model validates those
+    entries -- confirm against the router that uses it.
+    """
+
+    # Identifier of the chunk.
+    chunk_id: str
+    # Position of the chunk; list_chunks() sorts its results by this value.
+    chunk_index: int
+    # Short textual summary of the chunk's content.
+    content_summary: str
+    # Optional page number; defaults to None when not supplied.
+    page_number: Optional[int] = None
+    # Optional path of a file holding the chunk; defaults to None when not supplied.
+    chunk_file_path: Optional[str] = None
+
+
+class DocumentListResponse(BaseModel):
+    """Response payload carrying a list of documents plus aggregate counts.
+
+    NOTE(review): the three fields line up with the
+    ``(documents, total_documents, total_chunks)`` tuple returned by
+    ``RAGService.list_documents()`` -- confirm against the router.
+    """
+
+    # One entry per document.
+    documents: List[DocumentInfo]
+    # Number of documents (presumably len(documents) -- verify against caller).
+    total_documents: int
+    # Number of chunks across all listed documents.
+    total_chunks: int
+
+
+class DeleteResponse(BaseModel):
+    """Response payload reporting the outcome of a delete request."""
+
+    # True when something was deleted, False otherwise.
+    deleted: bool
+    # Human-readable description of the outcome.
+    message: str
--- a/backend/app/services/rag.py
+++ b/backend/app/services/rag.py
@ -113,3 +113,74 @@ class RAGService:
        )

        return await self.llm_client.complete(prompt=prompt, temperature=0.3, step_name="ResponseGeneration")
+
+    def list_documents(self) -> Tuple[List[Dict[str, Any]], int, int]:
+        """Summarise the documents stored in the collection.
+
+        Chunks are grouped by document id, which is the chunk id with its
+        final ``_<suffix>`` removed; a chunk id without an underscore forms
+        its own document.
+
+        Returns:
+            A ``(documents, total_documents, total_chunks)`` tuple. Each
+            document is a dict with the keys ``document_id``, ``filename``,
+            ``chunk_count`` and ``upload_date``. ``filename`` falls back to
+            ``"unknown"`` and ``upload_date`` to ``""`` when no chunk of the
+            document carries the value. An empty collection yields
+            ``([], 0, 0)``.
+        """
+        all_data = self.collection.get(include=["metadatas"])
+
+        metadatas = all_data["metadatas"]
+        if not metadatas:
+            return [], 0, 0
+
+        docs: Dict[str, Dict[str, Any]] = {}
+        for chunk_id, meta in zip(all_data["ids"], metadatas):
+            # A chunk stored without metadata is reported as None.
+            meta = meta or {}
+
+            prefix, sep, _ = chunk_id.rpartition("_")
+            doc_id = prefix if sep else chunk_id
+
+            info = docs.setdefault(
+                doc_id,
+                {
+                    "document_id": doc_id,
+                    "filename": "unknown",
+                    "chunk_count": 0,
+                    "upload_date": "",
+                },
+            )
+            info["chunk_count"] += 1
+            # Only overwrite when the chunk actually carries the value, so a
+            # chunk with sparse metadata cannot clobber what is already known.
+            if "filename" in meta:
+                info["filename"] = meta["filename"]
+            if "upload_date" in meta:
+                info["upload_date"] = meta["upload_date"]
+
+        doc_list = list(docs.values())
+        total_chunks = sum(info["chunk_count"] for info in doc_list)
+
+        return doc_list, len(doc_list), total_chunks
+
+    def list_chunks(self, document_id: str) -> List[Dict[str, Any]]:
+        """Return the chunks that belong to ``document_id``, ordered by index.
+
+        A chunk belongs to a document when its id with the final
+        ``_<suffix>`` removed equals ``document_id`` (a chunk id without an
+        underscore is its own document id). This is the same grouping rule
+        ``list_documents`` uses. A plain prefix test would also return chunks
+        of other documents whose id merely starts with ``document_id`` plus
+        an underscore, e.g. ``report`` would match ``report_v2_0``.
+
+        Args:
+            document_id: Identifier of the document whose chunks are wanted.
+
+        Returns:
+            A list of dicts with the keys ``chunk_id``, ``chunk_index``,
+            ``content_summary``, ``page_number`` and ``chunk_file_path``,
+            sorted by ``chunk_index``. Empty when nothing matches.
+        """
+        all_data = self.collection.get(include=["metadatas"])
+
+        chunks = []
+        for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"] or []):
+            prefix, sep, _ = chunk_id.rpartition("_")
+            owner_id = prefix if sep else chunk_id
+            if owner_id != document_id:
+                continue
+
+            # A chunk stored without metadata is reported as None.
+            meta = meta or {}
+            chunks.append({
+                "chunk_id": chunk_id,
+                "chunk_index": meta.get("chunk_index", 0),
+                "content_summary": meta.get("content_summary", ""),
+                "page_number": meta.get("page_number"),
+                "chunk_file_path": meta.get("chunk_file_path"),
+            })
+
+        chunks.sort(key=lambda chunk: chunk["chunk_index"])
+        return chunks
+
+    def delete_document(self, document_id: str) -> Tuple[bool, int]:
+        """Delete every chunk that belongs to ``document_id``.
+
+        A chunk belongs to a document when its id with the final
+        ``_<suffix>`` removed equals ``document_id`` (a chunk id without an
+        underscore is its own document id). This is the same grouping rule
+        ``list_documents`` uses. A plain prefix test would also delete chunks
+        of other documents whose id merely starts with ``document_id`` plus
+        an underscore, e.g. deleting ``report`` would wipe ``report_v2``.
+
+        Args:
+            document_id: Identifier of the document to remove.
+
+        Returns:
+            ``(True, n)`` after deleting ``n`` chunks, or ``(False, 0)`` when
+            no chunk belongs to the document (nothing is deleted).
+        """
+        # Only ids are needed to pick the chunks, so skip loading metadata.
+        all_data = self.collection.get(include=[])
+
+        ids_to_delete = []
+        for chunk_id in all_data["ids"]:
+            prefix, sep, _ = chunk_id.rpartition("_")
+            owner_id = prefix if sep else chunk_id
+            if owner_id == document_id:
+                ids_to_delete.append(chunk_id)
+
+        if not ids_to_delete:
+            return False, 0
+
+        self.collection.delete(ids=ids_to_delete)
+        return True, len(ids_to_delete)
+
+    def delete_chunk(self, chunk_id: str) -> bool:
+        """Delete a single chunk by id.
+
+        Args:
+            chunk_id: Identifier of the chunk to remove.
+
+        Returns:
+            ``True`` when the chunk existed and a delete was issued,
+            ``False`` when no chunk with that id is stored.
+        """
+        # Look up just this id instead of loading every id and metadata
+        # record in the collection to test membership.
+        existing = self.collection.get(ids=[chunk_id], include=[])
+
+        if chunk_id not in existing["ids"]:
+            return False
+
+        self.collection.delete(ids=[chunk_id])
+        return True