add README.md and test

grillazz 2025-05-03 13:18:27 +02:00
parent b5fcd0489a
commit e215876848
3 changed files with 42 additions and 45 deletions


@@ -31,6 +31,7 @@
     <li><a href="#worker-aware-async-scheduler">Schedule jobs</a></li>
     <li><a href="#smtp-setup">Email Configuration</a></li>
     <li><a href="#uv-knowledge-and-inspirations">UV knowledge and inspirations</a></li>
+    <li><a href="#large-language-model">Integration with local LLM</a></li>
   </ul>
 </li>
 <li><a href="#acknowledgments">Acknowledgments</a></li>
@@ -162,6 +163,24 @@ This service supports plaintext and HTML emails, and also allows sending templated emails
 It is implemented as a singleton to ensure that only one SMTP connection is maintained
 throughout the application lifecycle, optimizing resource usage.
 
+<p align="right">(<a href="#readme-top">back to top</a>)</p>
+
+### Large Language Model
+
+The `/v1/ml/chat/` endpoint handles chat-based interactions with the LLM.
+It accepts a user prompt and streams the model's response back in real time.
+The endpoint leverages FastAPI's asynchronous capabilities to serve many simultaneous
+requests with low latency and high throughput.
+Async support is particularly useful for reducing I/O bottlenecks when talking to the LLM server:
+by using an asynchronous HTTP client such as `httpx`, the application can handle multiple
+I/O-bound tasks concurrently, for example forwarding a request to the LLM server while
+streaming its response back to the client, which minimizes idle time and makes good use of resources.
+
+Install Ollama and run the llama3.2 model:
+```shell
+ollama run llama3.2
+```
+
 <p align="right">(<a href="#readme-top">back to top</a>)</p>
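The README text above describes the streaming endpoint without showing how it is wired. The following is a rough sketch only, not code from this commit: the router prefix, module path, and `get_llm_service` dependency are assumptions, and the prompt is read from form data because the test client further down posts `data={"prompt": ...}`.

```python
# Hypothetical sketch of the /v1/ml/chat/ route; module path and dependency
# names are assumed. Only StreamLLMService.stream_chat comes from this commit.
from fastapi import APIRouter, Depends, Form
from fastapi.responses import StreamingResponse

from app.services.llm import StreamLLMService, get_llm_service  # assumed import path

router = APIRouter(prefix="/v1/ml")


@router.post("/chat/")
async def chat(
    prompt: str = Form(...),  # the test client posts the prompt as a form field
    llm: StreamLLMService = Depends(get_llm_service),
) -> StreamingResponse:
    # stream_chat yields one JSON-encoded message per line, so NDJSON fits well
    return StreamingResponse(llm.stream_chat(prompt), media_type="application/x-ndjson")
```

Returning NDJSON keeps the client simple: each line arriving on the stream can be parsed independently as it comes in.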
@@ -215,6 +234,7 @@ I've included a few of my favorites to kick things off!
 - **[DEC 16 2024]** bump project to Python 3.13 :fast_forward:
 - **[JAN 28 2025]** add SMTP setup :email:
 - **[MAR 8 2025]** switch from poetry to uv :fast_forward:
+- **[MAY 3 2025]** add large language model integration :robot:
 <p align="right">(<a href="#readme-top">back to top</a>)</p>


@@ -10,7 +10,7 @@ class StreamLLMService:
     async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
         """Stream chat completion responses from LLM."""
-        # Send user message first
+        # Send the user a message first
         user_msg = {
             "role": "user",
             "content": prompt,


@@ -1,53 +1,30 @@
-from typing import Optional, AsyncGenerator
-
+import anyio
 import httpx
 import orjson
 
 
-class StreamLLMService:
-    def __init__(self, base_url: str = "http://localhost:11434/v1"):
-        self.base_url = base_url
-        self.model = "llama3.2"
-
-    async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
-        """Stream chat completion responses from LLM."""
-        # Send user message first
-        user_msg = {
-            "role": "user",
-            "content": prompt,
-        }
-        yield orjson.dumps(user_msg) + b"\n"
-
-        # Open client as context manager and stream responses
-        async with httpx.AsyncClient(base_url=self.base_url) as client:
-            async with client.stream(
-                "POST",
-                "/chat/completions",
-                json={
-                    "model": self.model,
-                    "messages": [{"role": "user", "content": prompt}],
-                    "stream": True,
-                },
-                timeout=60.0,
-            ) as response:
-                async for line in response.aiter_lines():
-                    print(line)
-                    if line.startswith("data: ") and line != "data: [DONE]":
-                        try:
-                            json_line = line[6:]  # Remove "data: " prefix
-                            data = orjson.loads(json_line)
-                            content = (
-                                data.get("choices", [{}])[0]
-                                .get("delta", {})
-                                .get("content", "")
-                            )
-                            if content:
-                                model_msg = {"role": "model", "content": content}
-                                yield orjson.dumps(model_msg) + b"\n"
-                        except Exception:
-                            pass
-
-
-# FastAPI dependency
-def get_llm_service(base_url: Optional[str] = None) -> StreamLLMService:
-    return StreamLLMService(base_url=base_url or "http://localhost:11434/v1")
+async def chat_with_endpoint():
+    async with httpx.AsyncClient() as client:
+        while True:
+            # Get user input
+            prompt = input("\nYou: ")
+            if prompt.lower() == "exit":
+                break
+
+            # Send request to the API
+            print("\nModel: ", end="", flush=True)
+            async with client.stream(
+                "POST",
+                "http://localhost:8000/chat/",
+                data={"prompt": prompt},
+                timeout=60,
+            ) as response:
+                async for chunk in response.aiter_lines():
+                    if chunk:
+                        try:
+                            data = orjson.loads(chunk)
+                            print(data["content"], end="", flush=True)
+                        except Exception as e:
+                            print(f"\nError parsing chunk: {e}")
+
+
+if __name__ == "__main__":
+    anyio.run(chat_with_endpoint)
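The rewritten script is an interactive smoke test rather than an automated one: with the API running locally (it posts to the hard-coded `http://localhost:8000/chat/`) and a model available to answer, it reads prompts from stdin, prints the streamed reply chunk by chunk, and stops when the user types `exit`. Note that `data={"prompt": prompt}` makes `httpx` send the prompt as form-encoded data, so the endpoint is presumably reading it as a form field rather than a JSON body.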