feat: inject Pydantic JSON schema into Deepseek prompt (Phase 6)

Follows the Deepseek JSON Output guide: the prompt now includes the word 'json' and a format example derived from the Pydantic model schema. Adds a _pydantic_to_json_instruction() helper that builds a human-readable schema description followed by an EXAMPLE JSON OUTPUT section. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-05-04 15:17:24 +08:00 · 2026-05-04 15:17:24 +08:00 · df62283f58
parent 226f4ed700
commit df62283f58
2 changed files with 61 additions and 5 deletions
--- a/backend/app/services/llm_client_dp.py
+++ b/backend/app/services/llm_client_dp.py
@ -39,6 +39,53 @@ def _truncate_prompt_for_log(prompt: str, first_chars: int = 100, last_chars: in
    )
 def _pydantic_to_json_instruction(model: Any) -> str:
    """Build a JSON-format prompt instruction from a Pydantic model's schema.

    Follows the Deepseek JSON Output guide: the prompt must contain the word
    "json" and an example of the expected shape.  The top-level properties of
    ``model.model_json_schema()`` are rendered as one bullet line per field,
    followed by an ``EXAMPLE JSON OUTPUT:`` section holding a filled-in
    example object.

    Args:
        model: A class exposing ``model_json_schema()`` (returning a JSON
            Schema dict) and ``__name__`` (used when the schema has no
            ``title``).

    Returns:
        The instruction text: a headline, one ``- "<field>": <type>`` line per
        property, a blank line, the ``EXAMPLE JSON OUTPUT:`` marker and the
        example serialized with ``json.dumps(..., indent=2)``.
    """
    schema = model.model_json_schema()
    title = schema.get("title", model.__name__)

    parts: list[str] = [f"Output the result in JSON format as a {title} object."]
    # Example object shown to the model; each field gets a representative
    # value of the right JSON type so that copying the shape stays valid.
    example: dict[str, Any] = {}

    for name, info in schema.get("properties", {}).items():
        # Nullable fields are emitted as ``anyOf: [<real>, {"type": "null"}]``
        # with no top-level "type".  Unwrap the single non-null variant so the
        # field is described by its real type; outer keys (description, ...)
        # take precedence over the variant's own.
        if "type" not in info:
            non_null = [
                variant
                for variant in info.get("anyOf", [])
                if isinstance(variant, dict) and variant.get("type") != "null"
            ]
            if len(non_null) == 1:
                outer = {k: v for k, v in info.items() if k != "anyOf"}
                info = {**non_null[0], **outer}

        t = info.get("type", "any")
        desc = info.get("description", "")
        # Avoid a dangling " — " when the field has no description.
        suffix = f" — {desc}" if desc else ""

        if t == "array":
            items = info.get("items")
            item_type = items.get("type", "string") if isinstance(items, dict) else "string"
            # NOTE(review): a missing minItems is reported as "min 1" (kept
            # from the previous behaviour) — presumably to discourage empty
            # arrays; confirm this is intended.
            min_items = info.get("minItems", 1)
            max_items = info.get("maxItems")
            bounds = f"min {min_items}"
            if max_items is not None:  # ``maxItems: 0`` is a real constraint
                bounds += f", max {max_items}"
            parts.append(f'- "{name}": array of {item_type} ({bounds}){suffix}')

            # Two items by default, but never fewer than minItems nor more
            # than maxItems, so the example itself satisfies the schema.
            count = max(2, min_items)
            if max_items is not None:
                count = min(count, max_items)
            if item_type in ("integer", "number"):
                example[name] = list(range(1, count + 1))
            elif item_type == "boolean":
                example[name] = [True] * count
            else:
                example[name] = [f"<{item_type}_{i}>" for i in range(1, count + 1)]
        elif t == "string":
            parts.append(f'- "{name}": {t}{suffix}')
            # Fall back to the field name so the placeholder is never "<>".
            example[name] = f"<{desc[:40] or name}>"
        elif t in ("integer", "number"):
            parts.append(f'- "{name}": {t}{suffix}')
            example[name] = 0
        elif t == "boolean":
            parts.append(f'- "{name}": {t}{suffix}')
            # A real JSON boolean rather than the string "<boolean>".
            example[name] = True
        else:
            parts.append(f'- "{name}": {t}{suffix}')
            example[name] = f"<{t}>"

    parts.append("")
    parts.append("EXAMPLE JSON OUTPUT:")
    parts.append(json.dumps(example, indent=2, ensure_ascii=False))
    return "\n".join(parts)
 class LLMClientDP:
    """Async Deepseek API client for query decomposition.
@ -126,17 +173,21 @@ class LLMClientDP:
        Deepseek supports ``response_format={"type": "json_object"}`` (which
        guarantees valid JSON) but not OpenAI's ``json_schema`` mode (which
-        would validate against a specific schema).  We use the JSON mode to
+        would validate against a specific schema).  We inject a JSON format
-        get a guaranteed-valid JSON response, then validate it client-side
+        instruction derived from *pydantic_model* into the prompt (per the
-        against *pydantic_model*.
+        Deepseek JSON Output guide), then validate client-side.
        """
        prompt_preview = _truncate_prompt_for_log(prompt, first_chars=300, last_chars=100)
        logger.info("[%s] Deepseek structured request started. Prompt: %s", step_name, prompt_preview)
        start_time = time.perf_counter()
        # Inject JSON format instruction from the Pydantic model.
        json_instruction = _pydantic_to_json_instruction(pydantic_model)
        full_prompt = f"{prompt}\n\n{json_instruction}"
        try:
            response = await self.complete(
-                prompt=prompt,
+                prompt=full_prompt,
                temperature=0.0,
                step_name=step_name,
                response_format={"type": "json_object"},
--- a/backend/app/test/test_phase6_llm_client_dp.py
+++ b/backend/app/test/test_phase6_llm_client_dp.py
@ -191,11 +191,16 @@ class TestLLMClientDPCompleteStructured:
        assert result == expected
        assert result.questions == ["Q1", "Q2", "Q3"]
        mock_complete.assert_called_once()
        # Verify Deepseek JSON mode is used
        call_kwargs = mock_complete.call_args.kwargs
        assert call_kwargs["response_format"] == {"type": "json_object"}
        assert call_kwargs["temperature"] == 0.0
        # The prompt must contain the JSON format instruction per Deepseek docs.
        full_prompt = call_kwargs["prompt"]
        assert "json" in full_prompt.lower()
        assert "EXAMPLE JSON OUTPUT" in full_prompt
        assert '"questions"' in full_prompt
    @pytest.mark.asyncio
    async def test_complete_structured_with_markdown_fence(self, client):
        """Should strip markdown code fences before JSON parsing."""