diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py index b0b8154..7a5a720 100644 --- a/backend/app/services/llm_client.py +++ b/backend/app/services/llm_client.py @@ -140,10 +140,10 @@ class LLMClient: async def _complete_structured_vllm(self, prompt: str, pydantic_model, step_name: str, start_time: float): """Use vLLM's native guided_json via extra_body for structured output.""" schema = pydantic_model.model_json_schema() + prompt_preview = self._truncate_prompt_for_log(prompt, first_chars=300, last_chars=100) self.logger.info( - "[%s] vLLM structured: sending guided_json schema=%s", - step_name, - json.dumps(schema)[:300], + "[%s] vLLM structured: prompt=%s schema=%s", + step_name, prompt_preview, json.dumps(schema)[:300], ) # Try the new unified format first, then legacy guided_json @@ -152,18 +152,22 @@ class LLMClient: ("guided_json", {"guided_json": schema}), ]: try: - self.logger.info("[%s] vLLM structured: trying format=%s", step_name, fmt_name) + self.logger.info("[%s] vLLM structured: trying format=%s extra=%s", step_name, fmt_name, extra) response = await self._client.chat.completions.create( model=self.model, messages=[{"role": "user", "content": prompt}], temperature=0.0, extra_body=extra, ) + self.logger.info("[%s] vLLM structured full response: %s", step_name, response) content = response.choices[0].message.content or "" elapsed_ms = (time.perf_counter() - start_time) * 1000 self.logger.info( - "[%s] vLLM structured succeeded with format=%s in %.2fms. Response: %s", - step_name, fmt_name, elapsed_ms, content[:200], + "[%s] vLLM structured succeeded format=%s in %.2fms tokens=(%s/%s). Parsed=%s", + step_name, fmt_name, elapsed_ms, + response.usage.prompt_tokens if response.usage else "?", + response.usage.completion_tokens if response.usage else "?", + content, ) return pydantic_model.model_validate_json(content) except Exception as exc: @@ -171,6 +175,7 @@ class LLMClient: self.logger.warning( "[%s] vLLM structured format=%s failed after %.2fms: %s", step_name, fmt_name, elapsed_ms, exc, + exc_info=True, ) elapsed_ms = (time.perf_counter() - start_time) * 1000