feat(backend): add documents CRUD service methods and Pydantic schemas
Add list_documents(), list_chunks(), delete_document(), delete_chunk() to RAGService for ChromaDB document management. New schemas: DocumentInfo, ChunkInfo, DocumentListResponse, DeleteResponse. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
52c09b86cb
commit
178461915a
|
|
@ -0,0 +1,29 @@
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
# Summary of one stored document, aggregated over its chunks.
# Field names match the keys of the per-document dicts built by
# RAGService.list_documents().
class DocumentInfo(BaseModel):
    # Identifier shared by every chunk of the document.
    document_id: str
    # Source file name as recorded in the chunk metadata.
    filename: str
    # Number of chunks currently stored for this document.
    chunk_count: int
    # Upload timestamp kept as a plain string; no format is enforced here.
    upload_date: str
|
||||||
|
|
||||||
|
|
||||||
|
# Description of a single stored chunk.
# Field names match the keys of the dicts built by RAGService.list_chunks().
class ChunkInfo(BaseModel):
    # Full ID of the chunk in the collection.
    chunk_id: str
    # Index of the chunk; list_chunks() sorts its results by this value.
    chunk_index: int
    # Short summary text taken from the chunk metadata.
    content_summary: str
    # Page number from the chunk metadata, when one was recorded.
    page_number: Optional[int] = None
    # Chunk file path from the chunk metadata, when one was recorded.
    chunk_file_path: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
# Response payload for a document listing.
# The three fields line up with the (documents, total_documents, total_chunks)
# tuple returned by RAGService.list_documents().
class DocumentListResponse(BaseModel):
    # One entry per stored document.
    documents: List[DocumentInfo]
    # Number of distinct documents.
    total_documents: int
    # Number of chunks across all documents.
    total_chunks: int
|
||||||
|
|
||||||
|
|
||||||
|
# Response payload for a delete request (document or chunk).
class DeleteResponse(BaseModel):
    # Whether anything was deleted.
    deleted: bool
    # Human-readable description of the outcome.
    message: str
|
||||||
|
|
@ -113,3 +113,74 @@ class RAGService:
|
||||||
)
|
)
|
||||||
|
|
||||||
return await self.llm_client.complete(prompt=prompt, temperature=0.3, step_name="ResponseGeneration")
|
return await self.llm_client.complete(prompt=prompt, temperature=0.3, step_name="ResponseGeneration")
|
||||||
|
|
||||||
|
def list_documents(self) -> Tuple[List[Dict[str, Any]], int, int]:
    """Summarise the stored chunks as one entry per document.

    A chunk's document ID is everything before the last underscore of its
    chunk ID; a chunk ID without an underscore is treated as its own
    single-chunk document.

    Returns:
        A ``(documents, total_documents, total_chunks)`` tuple. Each document
        is a dict with the keys ``document_id``, ``filename``,
        ``chunk_count`` and ``upload_date``, in first-seen order. When the
        collection holds no metadata the result is ``([], 0, 0)``.
    """
    all_data = self.collection.get(include=["metadatas"])

    if not all_data["metadatas"]:
        return [], 0, 0

    docs: Dict[str, Dict[str, Any]] = {}

    for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]):
        # A record stored without metadata comes back as None; treat it as
        # empty so the defaults below apply instead of raising.
        meta = meta or {}
        # rsplit yields a single element when there is no underscore, so
        # [0] is then the chunk ID itself.
        doc_id = chunk_id.rsplit("_", 1)[0]

        entry = docs.setdefault(
            doc_id,
            {
                "document_id": doc_id,
                "filename": "",
                "chunk_count": 0,
                "upload_date": "",
            },
        )
        # The last chunk seen for a document decides filename/upload_date.
        entry["filename"] = meta.get("filename", "unknown")
        entry["chunk_count"] += 1
        entry["upload_date"] = meta.get("upload_date", "")

    doc_list = list(docs.values())
    total_chunks = sum(doc["chunk_count"] for doc in doc_list)

    return doc_list, len(doc_list), total_chunks
|
||||||
|
|
||||||
|
def list_chunks(self, document_id: str) -> List[Dict[str, Any]]:
    """Return the chunks that belong to ``document_id``, ordered by index.

    A chunk belongs to the document whose ID equals everything before the
    last underscore of the chunk ID (or the whole chunk ID when it has no
    underscore). This is the same rule ``list_documents`` uses to group
    chunks, so an ID reported there always resolves to exactly its own
    chunks here. A plain prefix match would also pick up chunks of any
    document whose ID merely starts with ``document_id + "_"``.

    Args:
        document_id: Document ID as reported by ``list_documents``.

    Returns:
        A list of dicts with the keys ``chunk_id``, ``chunk_index``,
        ``content_summary``, ``page_number`` and ``chunk_file_path``,
        sorted by ``chunk_index``. Empty when nothing matches.
    """
    all_data = self.collection.get(include=["metadatas"])
    chunk_ids = all_data["ids"] or []
    metadatas = all_data["metadatas"] or []

    chunks = []
    for chunk_id, meta in zip(chunk_ids, metadatas):
        if chunk_id.rsplit("_", 1)[0] != document_id:
            continue
        # Records stored without metadata come back as None.
        meta = meta or {}
        chunks.append(
            {
                "chunk_id": chunk_id,
                "chunk_index": meta.get("chunk_index", 0),
                "content_summary": meta.get("content_summary", ""),
                "page_number": meta.get("page_number"),
                "chunk_file_path": meta.get("chunk_file_path"),
            }
        )

    chunks.sort(key=lambda chunk: chunk["chunk_index"])
    return chunks
|
||||||
|
|
||||||
|
def delete_document(self, document_id: str) -> Tuple[bool, int]:
    """Delete every chunk that belongs to ``document_id``.

    A chunk belongs to the document whose ID equals everything before the
    last underscore of the chunk ID (or the whole chunk ID when it has no
    underscore), matching how ``list_documents`` groups chunks. Matching on
    the bare prefix ``document_id + "_"`` would also delete the chunks of
    any other document whose ID starts with that prefix.

    Args:
        document_id: Document ID as reported by ``list_documents``.

    Returns:
        ``(True, n)`` after deleting ``n`` chunks, or ``(False, 0)`` when no
        chunk matches (nothing is deleted in that case).
    """
    # Only the IDs are needed to decide what to delete, so request no
    # additional fields from the collection.
    all_data = self.collection.get(include=[])

    ids_to_delete = [
        chunk_id
        for chunk_id in all_data["ids"]
        if chunk_id.rsplit("_", 1)[0] == document_id
    ]

    if not ids_to_delete:
        return False, 0

    self.collection.delete(ids=ids_to_delete)
    return True, len(ids_to_delete)
|
||||||
|
|
||||||
|
def delete_chunk(self, chunk_id: str) -> bool:
    """Delete a single chunk by its exact ID.

    Args:
        chunk_id: Full ID of the chunk to remove.

    Returns:
        ``True`` if the chunk existed and a delete was issued, ``False`` if
        the ID is empty or no such chunk is stored.
    """
    if not chunk_id:
        return False

    # Ask the collection for just this ID (and no extra fields) rather than
    # loading every record to test membership.
    existing = self.collection.get(ids=[chunk_id], include=[])

    if chunk_id not in existing["ids"]:
        return False

    self.collection.delete(ids=[chunk_id])
    return True
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue