diff --git a/README.md b/README.md
index 8533f45..ed4bb98 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@
Schedule jobs
Email Configuration
UV knowledge and inspirations
+ Integration with local LLM
Acknowledgments
@@ -162,6 +163,24 @@ This service supports plaintext and HTML emails, and also allows sending templat
It is implemented as a singleton to ensure that only one SMTP connection is maintained
throughout the application lifecycle, optimizing resource usage.
+(back to top)
+
+### Large Language Model
+The `/v1/ml/chat/` endpoint is designed to handle chat-based interactions with the LLM.
+It accepts a user prompt and streams responses back in real time.
+The endpoint leverages FastAPI's asynchronous capabilities to efficiently manage multiple simultaneous requests,
+ensuring low latency and high throughput.
+
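+One way to wire this up is to hand the service's async generator straight to a `StreamingResponse`.
+A minimal sketch, not the actual router module: the `/v1/ml` prefix, the `Form` field,
+and the import path of the `get_llm_service` dependency are assumptions based on the
+endpoint described above.
+
+```python
+from fastapi import APIRouter, Depends, Form
+from fastapi.responses import StreamingResponse
+
+from app.services.llm import StreamLLMService, get_llm_service
+
+router = APIRouter(prefix="/v1/ml")
+
+
+@router.post("/chat/")
+async def chat(
+    prompt: str = Form(...),
+    llm: StreamLLMService = Depends(get_llm_service),
+) -> StreamingResponse:
+    # stream_chat yields newline-delimited JSON chunks as the model produces them
+    return StreamingResponse(llm.stream_chat(prompt), media_type="application/x-ndjson")
+```
+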
+FastAPI's async support is particularly beneficial for reducing I/O bottlenecks when connecting to the LLM server.
+By using asynchronous HTTP clients like `httpx`,
+the application can handle multiple I/O-bound tasks concurrently,
+such as sending requests to the LLM server and streaming responses back to the client.
+This approach minimizes idle time and optimizes resource utilization, making it ideal for high-performance applications.
+
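+Under the hood this is the standard `httpx` streaming pattern: open a streamed POST against the
+model server and yield lines as they arrive. A condensed sketch of that pattern, assuming Ollama's
+OpenAI-compatible API at `http://localhost:11434/v1` and the `llama3.2` model:
+
+```python
+import httpx
+
+
+async def stream_completion(prompt: str):
+    """Yield raw JSON chunks from an OpenAI-compatible streaming endpoint."""
+    async with httpx.AsyncClient(base_url="http://localhost:11434/v1") as client:
+        async with client.stream(
+            "POST",
+            "/chat/completions",
+            json={
+                "model": "llama3.2",
+                "messages": [{"role": "user", "content": prompt}],
+                "stream": True,
+            },
+            timeout=60.0,
+        ) as response:
+            async for line in response.aiter_lines():
+                # SSE lines look like 'data: {...}'; the final sentinel is 'data: [DONE]'
+                if line.startswith("data: ") and line != "data: [DONE]":
+                    yield line[6:]
+```
+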
+Install [Ollama](https://ollama.com) and run the `llama3.2` model:
+```shell
+ollama run llama3.2
+```
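+
+The service targets Ollama's OpenAI-compatible API at `http://localhost:11434/v1` by default.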
(back to top)
@@ -215,6 +234,7 @@ I've included a few of my favorites to kick things off!
- **[DEC 16 2024]** bump project to Python 3.13 :fast_forward:
- **[JAN 28 2025]** add SMTP setup :email:
- **[MAR 8 2025]** switch from poetry to uv :fast_forward:
+- **[MAY 3 2025]** add large language model integration :robot:
(back to top)
diff --git a/app/services/llm.py b/app/services/llm.py
index db29c28..2b28ad7 100644
--- a/app/services/llm.py
+++ b/app/services/llm.py
@@ -10,7 +10,7 @@ class StreamLLMService:
async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
"""Stream chat completion responses from LLM."""
- # Send user message first
+        # Echo the user's message back to the client first
user_msg = {
"role": "user",
"content": prompt,
diff --git a/tests/chat.py b/tests/chat.py
index 25bfa2f..a231c6c 100644
--- a/tests/chat.py
+++ b/tests/chat.py
@@ -1,53 +1,30 @@
-from typing import Optional, AsyncGenerator
-
+import anyio
import httpx
import orjson
+async def chat_with_endpoint():
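+    """Interactive console client that streams NDJSON chunks from the chat endpoint."""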
+ async with httpx.AsyncClient() as client:
+ while True:
+ # Get user input
+ prompt = input("\nYou: ")
+ if prompt.lower() == "exit":
+ break
-class StreamLLMService:
- def __init__(self, base_url: str = "http://localhost:11434/v1"):
- self.base_url = base_url
- self.model = "llama3.2"
-
- async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
- """Stream chat completion responses from LLM."""
- # Send user message first
- user_msg = {
- "role": "user",
- "content": prompt,
- }
- yield orjson.dumps(user_msg) + b"\n"
-
- # Open client as context manager and stream responses
- async with httpx.AsyncClient(base_url=self.base_url) as client:
+ # Send request to the API
+ print("\nModel: ", end="", flush=True)
async with client.stream(
"POST",
- "/chat/completions",
- json={
- "model": self.model,
- "messages": [{"role": "user", "content": prompt}],
- "stream": True,
- },
- timeout=60.0,
+                "http://localhost:8000/v1/ml/chat/",
+                data={"prompt": prompt},
+                timeout=60.0,
) as response:
- async for line in response.aiter_lines():
- print(line)
- if line.startswith("data: ") and line != "data: [DONE]":
+ async for chunk in response.aiter_lines():
+ if chunk:
try:
- json_line = line[6:] # Remove "data: " prefix
- data = orjson.loads(json_line)
- content = (
- data.get("choices", [{}])[0]
- .get("delta", {})
- .get("content", "")
- )
- if content:
- model_msg = {"role": "model", "content": content}
- yield orjson.dumps(model_msg) + b"\n"
- except Exception:
- pass
+                            data = orjson.loads(chunk)
+                            # The stream echoes the user prompt first; print only model chunks
+                            if data.get("role") == "model":
+                                print(data["content"], end="", flush=True)
+ except Exception as e:
+ print(f"\nError parsing chunk: {e}")
-
-# FastAPI dependency
-def get_llm_service(base_url: Optional[str] = None) -> StreamLLMService:
- return StreamLLMService(base_url=base_url or "http://localhost:11434/v1")
+if __name__ == "__main__":
+ anyio.run(chat_with_endpoint)