From e2158768480658073dd99daa967449d9ac6a9194 Mon Sep 17 00:00:00 2001
From: grillazz
Date: Sat, 3 May 2025 13:18:27 +0200
Subject: [PATCH] add README.md and test

---
 README.md           | 20 ++++++++++++++
 app/services/llm.py |  2 +-
 tests/chat.py       | 65 +++++++++++++++------------------------------
 3 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 8533f45..ed4bb98 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@
   • Schedule jobs
   • Email Configuration
   • UV knowledge and inspirations
+  • Integration with local LLM
   • Acknowledgments
@@ -162,6 +163,24 @@ This service supports plaintext and HTML emails, and also allows sending templat
 It is implemented as a singleton to ensure that only one SMTP connection is maintained throughout the application lifecycle, optimizing resource usage.
+
 
 (back to top)
 
+
+### Large Language Model
+The `/v1/ml/chat/` endpoint handles chat-based interactions with the LLM.
+It accepts a user prompt and streams the model's response back in real time.
+The endpoint leverages FastAPI's asynchronous capabilities to serve many simultaneous requests
+while keeping latency low and throughput high.
+
+FastAPI's async support is particularly useful for reducing I/O bottlenecks when talking to the LLM server.
+With an asynchronous HTTP client such as `httpx`,
+the application can handle multiple I/O-bound tasks concurrently,
+such as forwarding prompts to the LLM server while streaming responses back to the client.
+This minimizes idle time and makes better use of resources under load.
+
+Install Ollama and start the model:
+```shell
+ollama run llama3.2
+```
 
 (back to top)
 
@@ -215,6 +234,7 @@ I've included a few of my favorites to kick things off!
 - **[DEC 16 2024]** bump project to Python 3.13 :fast_forward:
 - **[JAN 28 2025]** add SMTP setup :email:
 - **[MAR 8 2025]** switch from poetry to uv :fast_forward:
+- **[MAY 3 2025]** add large language model integration :robot:
 
 (back to top)
 
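The README hunk above documents the streaming endpoint, but the FastAPI route that exposes it is not part of this patch. The sketch below shows one plausible wiring, assuming a router mounted under `/v1/ml`, a form-encoded `prompt` field (matching `data={"prompt": ...}` in `tests/chat.py`), and the `StreamLLMService` / `get_llm_service` pair from `app/services/llm.py`; note that the README advertises `/v1/ml/chat/` while `tests/chat.py` posts to `http://localhost:8000/chat/`, so the prefix should be adjusted to the project's actual routing.

```python
# Hypothetical route wiring -- illustrative sketch, not part of this patch.
from typing import Annotated

from fastapi import APIRouter, Depends, Form
from fastapi.responses import StreamingResponse

from app.services.llm import StreamLLMService, get_llm_service  # assumed import path

router = APIRouter(prefix="/v1/ml", tags=["ml"])


@router.post("/chat/")
async def chat(
    prompt: Annotated[str, Form()],
    llm: Annotated[StreamLLMService, Depends(get_llm_service)],
) -> StreamingResponse:
    # stream_chat yields newline-delimited JSON chunks (orjson-encoded),
    # so the client can parse each line as it arrives.
    return StreamingResponse(llm.stream_chat(prompt), media_type="application/x-ndjson")
```

Streaming the generator straight through keeps memory use flat regardless of response length; `Form()` additionally requires `python-multipart` to be installed.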
diff --git a/app/services/llm.py b/app/services/llm.py
index db29c28..2b28ad7 100644
--- a/app/services/llm.py
+++ b/app/services/llm.py
@@ -10,7 +10,7 @@ class StreamLLMService:
 
     async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
         """Stream chat completion responses from LLM."""
-        # Send user message first
+        # Send the user message first
         user_msg = {
             "role": "user",
             "content": prompt,
diff --git a/tests/chat.py b/tests/chat.py
index 25bfa2f..a231c6c 100644
--- a/tests/chat.py
+++ b/tests/chat.py
@@ -1,53 +1,30 @@
-from typing import Optional, AsyncGenerator
-
+import anyio
 import httpx
 import orjson
 
 
+async def chat_with_endpoint():
+    async with httpx.AsyncClient() as client:
+        while True:
+            # Get user input
+            prompt = input("\nYou: ")
+            if prompt.lower() == "exit":
+                break
 
-class StreamLLMService:
-    def __init__(self, base_url: str = "http://localhost:11434/v1"):
-        self.base_url = base_url
-        self.model = "llama3.2"
-
-    async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
-        """Stream chat completion responses from LLM."""
-        # Send user message first
-        user_msg = {
-            "role": "user",
-            "content": prompt,
-        }
-        yield orjson.dumps(user_msg) + b"\n"
-
-        # Open client as context manager and stream responses
-        async with httpx.AsyncClient(base_url=self.base_url) as client:
+            # Send request to the API
+            print("\nModel: ", end="", flush=True)
             async with client.stream(
                 "POST",
-                "/chat/completions",
-                json={
-                    "model": self.model,
-                    "messages": [{"role": "user", "content": prompt}],
-                    "stream": True,
-                },
-                timeout=60.0,
+                "http://localhost:8000/chat/",
+                data={"prompt": prompt},
+                timeout=60
             ) as response:
-                async for line in response.aiter_lines():
-                    print(line)
-                    if line.startswith("data: ") and line != "data: [DONE]":
+                async for chunk in response.aiter_lines():
+                    if chunk:
                         try:
-                            json_line = line[6:]  # Remove "data: " prefix
-                            data = orjson.loads(json_line)
-                            content = (
-                                data.get("choices", [{}])[0]
-                                .get("delta", {})
-                                .get("content", "")
-                            )
-                            if content:
-                                model_msg = {"role": "model", "content": content}
-                                yield orjson.dumps(model_msg) + b"\n"
-                        except Exception:
-                            pass
-
-
-# FastAPI dependency
-def get_llm_service(base_url: Optional[str] = None) -> StreamLLMService:
-    return StreamLLMService(base_url=base_url or "http://localhost:11434/v1")
+
+
+if __name__ == "__main__":
+    anyio.run(chat_with_endpoint)
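The new `tests/chat.py` is an interactive script rather than something pytest will collect. Below is a hedged sketch of an automated counterpart; it assumes the FastAPI instance is importable as `app.main:app`, that the anyio pytest plugin is configured, that the route lives at the README's `/v1/ml/chat/` path, and, like the interactive script, that the Ollama model from the README is running, since the service streams from it.

```python
# Hypothetical automated counterpart to tests/chat.py -- assumptions noted above.
import httpx
import orjson
import pytest

from app.main import app  # assumed import path for the FastAPI instance


@pytest.mark.anyio
async def test_chat_streams_user_message_first():
    # Drive the ASGI app in-process instead of hitting http://localhost:8000.
    transport = httpx.ASGITransport(app=app)
    async with httpx.AsyncClient(transport=transport, base_url="http://test") as client:
        async with client.stream(
            "POST", "/v1/ml/chat/", data={"prompt": "Hello"}, timeout=60
        ) as response:
            assert response.status_code == 200
            chunks = [orjson.loads(line) async for line in response.aiter_lines() if line]

    # The service yields the user message as the first NDJSON chunk,
    # then streams model chunks of the form {"role": "model", "content": ...}.
    assert chunks[0] == {"role": "user", "content": "Hello"}
```

Because this still depends on a live model, a CI setup may prefer to stub the upstream LLM call instead.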