diff --git a/app/services/llm.py b/app/services/llm.py
new file mode 100644
index 0000000..3d6ed34
--- /dev/null
+++ b/app/services/llm.py
@@ -0,0 +1,52 @@
+import httpx
+import orjson
+from typing import AsyncGenerator, Optional
+
+
+class StreamLLMService:
+    def __init__(self, base_url: str = "http://localhost:11434/v1"):
+        self.base_url = base_url
+        self.model = "llama3.2"
+
+    async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
+        """Stream chat completion responses from the LLM as newline-delimited JSON bytes."""
+        # Echo the user message back as the first chunk
+        user_msg = {
+            'role': 'user',
+            'content': prompt,
+        }
+        yield orjson.dumps(user_msg) + b'\n'
+
+        # Open the client as a context manager and stream the completion
+        async with httpx.AsyncClient(base_url=self.base_url) as client:
+            async with client.stream(
+                "POST",
+                "/chat/completions",
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "stream": True
+                },
+                timeout=60.0
+            ) as response:
+                async for line in response.aiter_lines():
+                    if line.startswith("data: ") and line != "data: [DONE]":
+                        try:
+                            json_line = line[6:]  # Remove "data: " prefix
+                            data = orjson.loads(json_line)
+                            content = data.get("choices", [{}])[0].get("delta", {}).get("content", "")
+                            if content:
+                                model_msg = {
+                                    'role': 'model',
+                                    'content': content
+                                }
+                                yield orjson.dumps(model_msg) + b'\n'
+                        except (orjson.JSONDecodeError, IndexError):
+                            # Skip malformed or incomplete chunks
+                            pass
+
+
+# FastAPI dependency
+def get_llm_service(base_url: Optional[str] = None) -> StreamLLMService:
+    # Fall back to the default base URL when none is provided
+    return StreamLLMService(base_url=base_url) if base_url else StreamLLMService()
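
For context, a minimal sketch of how this dependency could be consumed from a route. The `/chat` endpoint path and the surrounding app layout are assumptions for illustration, not part of this diff:

```python
# Hypothetical consumer of StreamLLMService; the /chat path is an assumption.
from fastapi import Depends, FastAPI
from fastapi.responses import StreamingResponse

from app.services.llm import StreamLLMService, get_llm_service

app = FastAPI()


@app.post("/chat")
async def chat(
    prompt: str,
    llm: StreamLLMService = Depends(get_llm_service),
) -> StreamingResponse:
    # stream_chat yields newline-delimited JSON byte chunks,
    # so the response is served as NDJSON.
    return StreamingResponse(llm.stream_chat(prompt), media_type="application/x-ndjson")
```

Each yielded chunk is already a complete JSON object followed by a newline, so `application/x-ndjson` lets clients parse the stream line by line as it arrives.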