feat(backend): add LLM monitoring with step names, timing, and prompt logging

- LLMClient.complete() now accepts step_name parameter to identify processing step

- Logs prompt preview (first 100 + last 100 chars) at INFO level

- Logs processing time in milliseconds with token usage stats

- Updated QueryDecomposer, RelevanceFilter, and RAGService to pass step names

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-23 14:51:57 +08:00
parent 74cb8b83d5
commit f5cfe44183
4 changed files with 39 additions and 8 deletions

View File

@ -1,4 +1,5 @@
import logging
import time
from typing import Optional
import httpx
@ -28,11 +29,30 @@ class LLMClient:
),
)
async def complete(self, prompt: str, temperature: float = 0.7) -> str:
"""Send a chat completion request with optional thinking control."""
def _truncate_prompt_for_log(self, prompt: str, first_chars: int = 100, last_chars: int = 100) -> str:
"""Truncate prompt for logging: show first N and last N chars with ellipsis."""
if len(prompt) <= first_chars + last_chars:
return prompt
return f"{prompt[:first_chars]}...({len(prompt) - first_chars - last_chars} chars omitted)...{prompt[-last_chars:]}"
async def complete(self, prompt: str, temperature: float = 0.7, step_name: str = "LLM") -> str:
"""Send a chat completion request with optional thinking control.
Args:
prompt: The prompt to send to the LLM.
temperature: Sampling temperature.
step_name: Identifier for the processing step (e.g., "QueryDecomposer").
Returns:
The LLM response text.
"""
messages = [{"role": "user", "content": prompt}]
extra_body = self._build_extra_body()
prompt_preview = self._truncate_prompt_for_log(prompt)
self.logger.info("[%s] LLM request started. Prompt: %s", step_name, prompt_preview)
start_time = time.perf_counter()
try:
response = await self._client.chat.completions.create(
model=self.model,
@ -40,12 +60,23 @@ class LLMClient:
temperature=temperature,
extra_body=extra_body if extra_body else None,
)
return response.choices[0].message.content or ""
content = response.choices[0].message.content or ""
elapsed_ms = (time.perf_counter() - start_time) * 1000
self.logger.info(
"[%s] LLM request completed in %.2fms (prompt_tokens=%s, completion_tokens=%s)",
step_name,
elapsed_ms,
response.usage.prompt_tokens if response.usage else "?",
response.usage.completion_tokens if response.usage else "?",
)
return content
except (APITimeoutError, APIError) as exc:
self.logger.error("LLM API error: %s", exc)
elapsed_ms = (time.perf_counter() - start_time) * 1000
self.logger.error("[%s] LLM API error after %.2fms: %s", step_name, elapsed_ms, exc)
raise LLMClientError from exc
except Exception as exc:
self.logger.error("Unexpected LLM error: %s", exc)
elapsed_ms = (time.perf_counter() - start_time) * 1000
self.logger.error("[%s] Unexpected LLM error after %.2fms: %s", step_name, elapsed_ms, exc)
raise LLMClientError from exc
def _build_extra_body(self) -> dict:

View File

@ -43,7 +43,7 @@ class QueryDecomposer:
prompt = f"Given question: '{question}', extract key search keywords as JSON array"
try:
response = await self.llm_client.complete(prompt)
response = await self.llm_client.complete(prompt, step_name="QueryDecomposer")
except Exception as exc:
logger.warning("LLM decomposition failed: %s", exc)
return []

View File

@ -112,4 +112,4 @@ class RAGService:
f"Answer:"
)
return await self.llm_client.complete(prompt=prompt, temperature=0.3)
return await self.llm_client.complete(prompt=prompt, temperature=0.3, step_name="ResponseGeneration")

View File

@ -36,7 +36,7 @@ class RelevanceFilter:
prompt = self._build_prompt(question, chunks)
try:
response = await self.llm_client.complete(prompt, temperature=0.0)
response = await self.llm_client.complete(prompt, temperature=0.0, step_name="RelevanceFilter")
except Exception as exc:
logger.error("RelevanceFilter LLM call failed: %s", exc)
return []