feat(backend): add documents CRUD service methods and Pydantic schemas

Add list_documents(), list_chunks(), delete_document(), delete_chunk() to RAGService for ChromaDB document management. New schemas: DocumentInfo, ChunkInfo, DocumentListResponse, DeleteResponse.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-23 19:02:07 +08:00
parent 52c09b86cb
commit 178461915a
2 changed files with 100 additions and 0 deletions

View File

@ -0,0 +1,29 @@
from typing import List, Optional
from pydantic import BaseModel
class DocumentInfo(BaseModel):
    """Summary record describing one stored document."""

    # Identifier of the document.
    document_id: str
    # Name of the file the document came from.
    filename: str
    # Number of chunks that belong to this document.
    chunk_count: int
    # Upload date as a string; the exact format is not fixed by this schema.
    upload_date: str
class ChunkInfo(BaseModel):
    """Descriptive record for a single chunk of a document."""

    # Identifier of the chunk.
    chunk_id: str
    # Position of the chunk within its document.
    chunk_index: int
    # Short summary of the chunk's content.
    content_summary: str
    # Page the chunk came from; optional, defaults to None.
    page_number: Optional[int] = None
    # Path of a file associated with the chunk; optional, defaults to None.
    chunk_file_path: Optional[str] = None
class DocumentListResponse(BaseModel):
    """Response payload for a document listing, with overall totals."""

    # One entry per document.
    documents: List[DocumentInfo]
    # Total number of documents.
    total_documents: int
    # Total number of chunks.
    total_chunks: int
class DeleteResponse(BaseModel):
    """Response payload reporting the outcome of a delete request."""

    # Whether something was deleted.
    deleted: bool
    # Human-readable description of the outcome.
    message: str

View File

@ -113,3 +113,74 @@ class RAGService:
) )
return await self.llm_client.complete(prompt=prompt, temperature=0.3, step_name="ResponseGeneration") return await self.llm_client.complete(prompt=prompt, temperature=0.3, step_name="ResponseGeneration")
def list_documents(self) -> Tuple[List[Dict[str, Any]], int, int]:
    """Summarise the documents stored in the collection.

    Chunks are grouped by document id, which is taken to be everything
    before the last underscore of the chunk id (``"<document_id>_<suffix>"``).
    A chunk id without an underscore is treated as its own document.

    Returns:
        A ``(documents, total_documents, total_chunks)`` tuple. Each entry
        of ``documents`` is a dict with the keys ``document_id``,
        ``filename``, ``chunk_count`` and ``upload_date``. ``filename``
        falls back to ``"unknown"`` and ``upload_date`` to ``""`` when no
        chunk of the document provides them.
    """
    all_data = self.collection.get(include=["metadatas"])
    metadatas = all_data["metadatas"]
    if not metadatas:
        return [], 0, 0

    docs: Dict[str, Dict[str, Any]] = {}
    for chunk_id, meta in zip(all_data["ids"], metadatas):
        # A chunk stored without metadata is reported as None; count it
        # anyway instead of crashing on ``None.get``.
        meta = meta or {}
        prefix, sep, _ = chunk_id.rpartition("_")
        doc_id = prefix if sep else chunk_id

        info = docs.setdefault(
            doc_id,
            {"filename": "unknown", "chunk_count": 0, "upload_date": ""},
        )
        info["chunk_count"] += 1
        # Only overwrite when this chunk actually carries the field, so a
        # chunk lacking it cannot erase a value set by a sibling chunk.
        if "filename" in meta:
            info["filename"] = meta["filename"]
        if "upload_date" in meta:
            info["upload_date"] = meta["upload_date"]

    doc_list = [
        {
            "document_id": doc_id,
            "filename": info["filename"],
            "chunk_count": info["chunk_count"],
            "upload_date": info["upload_date"],
        }
        for doc_id, info in docs.items()
    ]
    total_chunks = sum(entry["chunk_count"] for entry in doc_list)
    return doc_list, len(doc_list), total_chunks
def list_chunks(self, document_id: str) -> List[Dict[str, Any]]:
    """Return the chunks of one document, ordered by ``chunk_index``.

    A chunk belongs to ``document_id`` when the part of its id before the
    last underscore equals ``document_id`` exactly (a chunk id without an
    underscore is its own document id). A plain prefix test would also pick
    up chunks of other documents whose id merely starts with
    ``document_id + "_"``, e.g. ``"report"`` vs ``"report_final_0"``.

    Args:
        document_id: Identifier of the document whose chunks are wanted.

    Returns:
        A list of dicts with the keys ``chunk_id``, ``chunk_index``,
        ``content_summary``, ``page_number`` and ``chunk_file_path``;
        empty when no chunk matches.
    """
    all_data = self.collection.get(include=["metadatas"])
    metadatas = all_data["metadatas"] or []

    chunks = []
    for chunk_id, meta in zip(all_data["ids"], metadatas):
        prefix, sep, _ = chunk_id.rpartition("_")
        owner = prefix if sep else chunk_id
        if owner != document_id:
            continue
        # Chunks stored without metadata come back as None.
        meta = meta or {}
        chunks.append({
            "chunk_id": chunk_id,
            "chunk_index": meta.get("chunk_index", 0),
            "content_summary": meta.get("content_summary", ""),
            "page_number": meta.get("page_number"),
            "chunk_file_path": meta.get("chunk_file_path"),
        })
    chunks.sort(key=lambda chunk: chunk["chunk_index"])
    return chunks
def delete_document(self, document_id: str) -> Tuple[bool, int]:
    """Delete every chunk that belongs to ``document_id``.

    A chunk belongs to the document when the part of its id before the last
    underscore equals ``document_id`` exactly (a chunk id without an
    underscore is its own document id). Matching on the bare prefix
    ``document_id + "_"`` would also delete chunks of unrelated documents
    such as ``"report_final_0"`` when asked to delete ``"report"``.

    Args:
        document_id: Identifier of the document to remove.

    Returns:
        ``(True, n)`` after deleting ``n`` chunks, or ``(False, 0)`` when
        no chunk matched and nothing was deleted.
    """
    # Only the ids are needed to decide what to delete, so skip metadata.
    all_data = self.collection.get(include=[])

    ids_to_delete = []
    for chunk_id in all_data["ids"]:
        prefix, sep, _ = chunk_id.rpartition("_")
        owner = prefix if sep else chunk_id
        if owner == document_id:
            ids_to_delete.append(chunk_id)

    if not ids_to_delete:
        return False, 0
    self.collection.delete(ids=ids_to_delete)
    return True, len(ids_to_delete)
def delete_chunk(self, chunk_id: str) -> bool:
    """Delete a single chunk by id.

    Args:
        chunk_id: Identifier of the chunk to remove.

    Returns:
        ``True`` if the chunk existed and was deleted, ``False`` if no chunk
        with that id is stored.
    """
    # Look up just this id (and no payload) rather than loading the whole
    # collection to test membership.
    existing = self.collection.get(ids=[chunk_id], include=[])
    if chunk_id not in existing["ids"]:
        return False
    self.collection.delete(ids=[chunk_id])
    return True