add README.md and test

grillazz 2025-05-03 13:18:27 +02:00
parent b5fcd0489a
commit e215876848
3 changed files with 42 additions and 45 deletions


@@ -31,6 +31,7 @@
     <li><a href="#worker-aware-async-scheduler">Schedule jobs</a></li>
     <li><a href="#smtp-setup">Email Configuration</a></li>
     <li><a href="#uv-knowledge-and-inspirations">UV knowledge and inspirations</a></li>
+    <li><a href="#large-language-model">Integration with local LLM</a></li>
   </ul>
 </li>
 <li><a href="#acknowledgments">Acknowledgments</a></li>
@@ -162,6 +163,24 @@ This service supports plaintext and HTML emails, and also allows sending templated emails
 It is implemented as a singleton to ensure that only one SMTP connection is maintained
 throughout the application lifecycle, optimizing resource usage.
 
+<p align="right">(<a href="#readme-top">back to top</a>)</p>
+
+### Large Language Model
+
+The `/v1/ml/chat/` endpoint handles chat-based interactions with the LLM.
+It accepts a user prompt and streams the model's response back in real time.
+The endpoint leverages FastAPI's asynchronous capabilities to serve many simultaneous
+requests with low latency and high throughput.
+Async support is particularly useful for reducing I/O bottlenecks when talking to the LLM server:
+by using an asynchronous HTTP client such as `httpx`, the application can handle multiple
+I/O-bound tasks concurrently, for example forwarding a request to the LLM server while
+streaming its response back to the client, which minimizes idle time and makes good use of resources.
+
+Install Ollama and run the llama3.2 model:
+```shell
+ollama run llama3.2
+```
+
 <p align="right">(<a href="#readme-top">back to top</a>)</p>
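The README text above describes the streaming endpoint without showing how it is wired. The following is a rough sketch only, not code from this commit: the router prefix, module path, and `get_llm_service` dependency are assumptions, and the prompt is read from form data because the test client further down posts `data={"prompt": ...}`.

```python
# Hypothetical sketch of the /v1/ml/chat/ route; module path and dependency
# names are assumed. Only StreamLLMService.stream_chat comes from this commit.
from fastapi import APIRouter, Depends, Form
from fastapi.responses import StreamingResponse

from app.services.llm import StreamLLMService, get_llm_service  # assumed import path

router = APIRouter(prefix="/v1/ml")


@router.post("/chat/")
async def chat(
    prompt: str = Form(...),  # the test client posts the prompt as a form field
    llm: StreamLLMService = Depends(get_llm_service),
) -> StreamingResponse:
    # stream_chat yields one JSON-encoded message per line, so NDJSON fits well
    return StreamingResponse(llm.stream_chat(prompt), media_type="application/x-ndjson")
```

Returning NDJSON keeps the client simple: each line arriving on the stream can be parsed independently as it comes in.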
@@ -215,6 +234,7 @@ I've included a few of my favorites to kick things off!
 - **[DEC 16 2024]** bump project to Python 3.13 :fast_forward:
 - **[JAN 28 2025]** add SMTP setup :email:
 - **[MAR 8 2025]** switch from poetry to uv :fast_forward:
+- **[MAY 3 2025]** add large language model integration :robot:
 <p align="right">(<a href="#readme-top">back to top</a>)</p>


@@ -10,7 +10,7 @@ class StreamLLMService:
     async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
         """Stream chat completion responses from LLM."""
-        # Send user message first
+        # Send the user a message first
         user_msg = {
             "role": "user",
             "content": prompt,


@@ -1,53 +1,30 @@
-from typing import Optional, AsyncGenerator
-
+import anyio
 import httpx
 import orjson
 
 
-class StreamLLMService:
-    def __init__(self, base_url: str = "http://localhost:11434/v1"):
-        self.base_url = base_url
-        self.model = "llama3.2"
-
-    async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
-        """Stream chat completion responses from LLM."""
-        # Send user message first
-        user_msg = {
-            "role": "user",
-            "content": prompt,
-        }
-        yield orjson.dumps(user_msg) + b"\n"
-
-        # Open client as context manager and stream responses
-        async with httpx.AsyncClient(base_url=self.base_url) as client:
-            async with client.stream(
-                "POST",
-                "/chat/completions",
-                json={
-                    "model": self.model,
-                    "messages": [{"role": "user", "content": prompt}],
-                    "stream": True,
-                },
-                timeout=60.0,
-            ) as response:
-                async for line in response.aiter_lines():
-                    print(line)
-                    if line.startswith("data: ") and line != "data: [DONE]":
-                        try:
-                            json_line = line[6:]  # Remove "data: " prefix
-                            data = orjson.loads(json_line)
-                            content = (
-                                data.get("choices", [{}])[0]
-                                .get("delta", {})
-                                .get("content", "")
-                            )
-                            if content:
-                                model_msg = {"role": "model", "content": content}
-                                yield orjson.dumps(model_msg) + b"\n"
-                        except Exception:
-                            pass
-
-
-# FastAPI dependency
-def get_llm_service(base_url: Optional[str] = None) -> StreamLLMService:
-    return StreamLLMService(base_url=base_url or "http://localhost:11434/v1")
+async def chat_with_endpoint():
+    async with httpx.AsyncClient() as client:
+        while True:
+            # Get user input
+            prompt = input("\nYou: ")
+            if prompt.lower() == "exit":
+                break
+
+            # Send request to the API
+            print("\nModel: ", end="", flush=True)
+            async with client.stream(
+                "POST",
+                "http://localhost:8000/chat/",
+                data={"prompt": prompt},
+                timeout=60,
+            ) as response:
+                async for chunk in response.aiter_lines():
+                    if chunk:
+                        try:
+                            data = orjson.loads(chunk)
+                            print(data["content"], end="", flush=True)
+                        except Exception as e:
+                            print(f"\nError parsing chunk: {e}")
+
+
+if __name__ == "__main__":
+    anyio.run(chat_with_endpoint)
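The rewritten script is an interactive smoke test rather than an automated one: with the API running locally (it posts to the hard-coded `http://localhost:8000/chat/`) and a model available to answer, it reads prompts from stdin, prints the streamed reply chunk by chunk, and stops when the user types `exit`. Note that `data={"prompt": prompt}` makes `httpx` send the prompt as form-encoded data, so the endpoint is presumably reading it as a form field rather than a JSON body.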