import httpx
import orjson
from typing import AsyncGenerator, Optional


class StreamLLMService:
    def __init__(self, base_url: str = "http://localhost:11434/v1"):
        self.base_url = base_url
        self.model = "llama3.2"

    async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
        """Stream chat completion responses from the LLM as NDJSON bytes."""
        # Echo the user message first so the consumer sees the full exchange.
        yield orjson.dumps({"role": "user", "content": prompt}) + b"\n"

        async with httpx.AsyncClient(base_url=self.base_url) as client:
            request_data = {
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": True,
            }
            async with client.stream(
                "POST", "/chat/completions", json=request_data, timeout=60.0
            ) as response:
                async for line in response.aiter_lines():
                    # Only SSE data lines carry payloads; "[DONE]" marks the end.
                    if not (line.startswith("data: ") and line != "data: [DONE]"):
                        continue
                    try:
                        data = orjson.loads(line[6:])  # Skip "data: " prefix
                        if (
                            content := data.get("choices", [{}])[0]
                            .get("delta", {})
                            .get("content", "")
                        ):
                            yield (
                                orjson.dumps({"role": "model", "content": content})
                                + b"\n"
                            )
                    except Exception:
                        # Skip malformed or non-JSON lines instead of
                        # aborting the stream.
                        pass


def get_llm_service(base_url: Optional[str] = None) -> StreamLLMService:
    # Fall back to the constructor's default URL when none is supplied;
    # forwarding base_url=None would override the default and break the
    # httpx client.
    if base_url is None:
        return StreamLLMService()
    return StreamLLMService(base_url=base_url)
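

# A minimal usage sketch (an assumption, not part of the original file):
# drive the generator with asyncio and print each NDJSON chunk as it arrives.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:  # _demo is a hypothetical helper name
        service = get_llm_service()
        async for chunk in service.stream_chat("Why is the sky blue?"):
            print(orjson.loads(chunk))

    asyncio.run(_demo())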