add speech to text command (#6)

This commit is contained in:
Dmitry Afanasyev 2023-09-26 05:30:27 +03:00 committed by GitHub
parent 29355722fa
commit 9aca4c2aeb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 150 additions and 68 deletions

View File

@ -46,16 +46,21 @@ methods:
## Chat: ## Chat:
```shell ```shell
cd chat_gpt_microservice cd bot_microservice
python3 run.py python3 run.py
``` ```
```bash ```bash
cd chat_gpt_microservice cd bot_microservice
poetry run uvicorn --host 0.0.0.0 --factory run:create_app --port 1338 --reload poetry run uvicorn --host 0.0.0.0 --factory run:create_app --port 1338 --reload
``` ```
```bash
cd bot_microservice
gunicorn main:create_app --workers 10 --bind 0.0.0.0:8083 --worker-class uvicorn.workers.UvicornWorker --timeout 150 --max-requests 2000 --max-requests-jitter 400
```
## Tests ## Tests

View File

@ -1,7 +1,7 @@
from fastapi import APIRouter, Request from fastapi import APIRouter, Request
from fastapi.responses import ORJSONResponse
from settings.config import get_settings from settings.config import get_settings
from starlette import status from starlette import status
from starlette.responses import Response
router = APIRouter() router = APIRouter()
settings = get_settings() settings = get_settings()
@ -10,10 +10,10 @@ settings = get_settings()
@router.post( @router.post(
f"/{settings.TELEGRAM_API_TOKEN}", f"/{settings.TELEGRAM_API_TOKEN}",
name="system:process_bot_updates", name="system:process_bot_updates",
response_class=Response,
status_code=status.HTTP_202_ACCEPTED, status_code=status.HTTP_202_ACCEPTED,
summary="process bot updates", summary="process bot updates",
include_in_schema=False, include_in_schema=False,
) )
async def process_bot_updates(request: Request) -> ORJSONResponse: async def process_bot_updates(request: Request) -> None:
await request.app.state.queue.put_updates_on_queue(request) await request.app.state.queue.put_updates_on_queue(request)
return ORJSONResponse(content=None, status_code=status.HTTP_202_ACCEPTED)

View File

@ -1,5 +1,7 @@
from enum import StrEnum from enum import StrEnum
AUDIO_SEGMENT_DURATION = 120 * 1000
API_PREFIX = "/api" API_PREFIX = "/api"
CHAT_GPT_BASE_URL = "http://chat_service:1338/backend-api/v2/conversation" CHAT_GPT_BASE_URL = "http://chat_service:1338/backend-api/v2/conversation"

View File

@ -29,13 +29,15 @@ class BotApplication:
self._add_handlers() self._add_handlers()
async def set_webhook(self) -> None: async def set_webhook(self) -> None:
await self.application.initialize() _, webhook_info = await asyncio.gather(self.application.initialize(), self.application.bot.get_webhook_info())
await self.application.bot.set_webhook(url=self.webhook_url) if not webhook_info.url:
logger.info('webhook is set') await self.application.bot.set_webhook(url=self.webhook_url)
webhook_info = await self.application.bot.get_webhook_info()
logger.info("webhook is set", ip_address=webhook_info.ip_address)
async def delete_webhook(self) -> None: async def delete_webhook(self) -> None:
await self.application.bot.delete_webhook() if await self.application.bot.delete_webhook():
logger.info('webhook has been deleted') logger.info("webhook has been deleted")
async def polling(self) -> None: async def polling(self) -> None:
await self.application.initialize() await self.application.initialize()
@ -73,5 +75,5 @@ class BotQueue:
async def get_updates_from_queue(self) -> None: async def get_updates_from_queue(self) -> None:
while True: while True:
update = await self.queue.get() update = await self.queue.get()
await self.bot_app.application.process_update(update) asyncio.create_task(self.bot_app.application.process_update(update))
await sleep(0) await sleep(0)

View File

@ -1,10 +1,11 @@
import asyncio
import random import random
import tempfile import tempfile
from uuid import uuid4 from uuid import uuid4
import httpx import httpx
from constants import CHAT_GPT_BASE_URL from constants import CHAT_GPT_BASE_URL
from core.utils import convert_file_to_wav from core.utils import SpeechToTextService
from httpx import AsyncClient, AsyncHTTPTransport from httpx import AsyncClient, AsyncHTTPTransport
from loguru import logger from loguru import logger
from telegram import Update from telegram import Update
@ -14,19 +15,20 @@ from telegram.ext import ContextTypes
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Send a message when the command /help is issued.""" """Send a message when the command /help is issued."""
if update.message: if not update.message:
await update.message.reply_text( return None
"Help!", await update.message.reply_text(
disable_notification=True, "Help!",
api_kwargs={"text": "Hello World"}, disable_notification=True,
) api_kwargs={"text": "Hello World"},
return None )
async def ask_question(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: async def ask_question(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
await update.message.reply_text( # type: ignore[union-attr] if not update.message:
"Пожалуйста подождите, ответ в среднем занимает 10-15 секунд" return None
)
await update.message.reply_text("Пожалуйста подождите, ответ в среднем занимает 10-15 секунд")
chat_gpt_request = { chat_gpt_request = {
"conversation_id": str(uuid4()), "conversation_id": str(uuid4()),
@ -39,36 +41,51 @@ async def ask_question(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
"conversation": [], "conversation": [],
"internet_access": False, "internet_access": False,
"content_type": "text", "content_type": "text",
"parts": [{"content": update.message.text, "role": "user"}], # type: ignore[union-attr] "parts": [{"content": update.message.text, "role": "user"}],
}, },
}, },
} }
transport = AsyncHTTPTransport(retries=1) transport = AsyncHTTPTransport(retries=3)
async with AsyncClient(transport=transport) as client: async with AsyncClient(transport=transport, timeout=50) as client:
try: try:
response = await client.post(CHAT_GPT_BASE_URL, json=chat_gpt_request) response = await client.post(CHAT_GPT_BASE_URL, json=chat_gpt_request, timeout=50)
status = response.status_code status = response.status_code
if status != httpx.codes.OK: if status != httpx.codes.OK:
logger.info(f'got response status: {status} from chat api', data=chat_gpt_request) logger.info(f'got response status: {status} from chat api', data=chat_gpt_request)
await update.message.reply_text( # type: ignore[union-attr] await update.message.reply_text(
"Что-то пошло не так, попробуйте еще раз или обратитесь к администратору" "Что-то пошло не так, попробуйте еще раз или обратитесь к администратору"
) )
return return
data = response.json() await update.message.reply_text(response.text)
await update.message.reply_text(data) # type: ignore[union-attr]
except Exception as error: except Exception as error:
logger.error("error get data from chat api", error=error) logger.error("error get data from chat api", error=error)
await update.message.reply_text("Вообще всё сломалось :(") # type: ignore[union-attr] await update.message.reply_text("Вообще всё сломалось :(")
async def voice_recognize(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: async def voice_recognize(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
await update.message.reply_text( # type: ignore[union-attr] if not update.message:
"Пожалуйста, ожидайте :)\nТрехминутная запись обрабатывается примерно 30 секунд" return None
) await update.message.reply_text("Пожалуйста, ожидайте :)\nТрехминутная запись обрабатывается примерно 30 секунд")
sound_bytes = await update.message.voice.get_file() # type: ignore[union-attr] if not update.message.voice:
sound_bytes = await sound_bytes.download_as_bytearray() return None
sound_file = await update.message.voice.get_file()
sound_bytes = await sound_file.download_as_bytearray()
with tempfile.NamedTemporaryFile(delete=False) as tmpfile: with tempfile.NamedTemporaryFile(delete=False) as tmpfile:
tmpfile.write(sound_bytes) tmpfile.write(sound_bytes)
convert_file_to_wav(tmpfile.name)
logger.info('file has been saved', filename=tmpfile.name)
speech_to_text_service = SpeechToTextService(filename=tmpfile.name)
speech_to_text_service.get_text_from_audio()
part = 0
while speech_to_text_service.text_parts or not speech_to_text_service.text_recognised:
if text := speech_to_text_service.text_parts.get(part):
speech_to_text_service.text_parts.pop(part)
await update.message.reply_text(text)
part += 1
await asyncio.sleep(5)

View File

@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, cast
from constants import LogLevelEnum from constants import LogLevelEnum
from loguru import logger from loguru import logger
from sentry_sdk.integrations.logging import EventHandler from sentry_sdk.integrations.logging import EventHandler
from settings.config import get_settings
if TYPE_CHECKING: if TYPE_CHECKING:
from loguru import Record from loguru import Record
@ -13,6 +14,9 @@ else:
Record = dict[str, Any] Record = dict[str, Any]
settings = get_settings()
class InterceptHandler(logging.Handler): class InterceptHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None: def emit(self, record: logging.LogRecord) -> None:
# Get corresponding Loguru level if it exists # Get corresponding Loguru level if it exists
@ -29,7 +33,7 @@ class InterceptHandler(logging.Handler):
logger.opt(depth=depth, exception=record.exc_info).log( logger.opt(depth=depth, exception=record.exc_info).log(
level, level,
record.getMessage(), record.getMessage().replace(settings.TELEGRAM_API_TOKEN, "TELEGRAM_API_TOKEN".center(24, '*')),
) )
@ -97,6 +101,3 @@ def _text_formatter(record: Record) -> str:
format_ += "{exception}\n" format_ += "{exception}\n"
return format_ return format_
configure_logging(level=LogLevelEnum.DEBUG, enable_json_logs=True, enable_sentry_logs=True)

View File

@ -1,9 +1,18 @@
import os
import subprocess # noqa import subprocess # noqa
from concurrent.futures.thread import ThreadPoolExecutor
from datetime import datetime, timedelta from datetime import datetime, timedelta
from functools import lru_cache, wraps from functools import lru_cache, wraps
from typing import Any from typing import Any
from constants import AUDIO_SEGMENT_DURATION
from loguru import logger from loguru import logger
from pydub import AudioSegment
from speech_recognition import (
AudioFile,
Recognizer,
UnknownValueError as SpeechRecognizerError,
)
def timed_cache(**timedelta_kwargs: Any) -> Any: def timed_cache(**timedelta_kwargs: Any) -> Any:
@ -27,13 +36,69 @@ def timed_cache(**timedelta_kwargs: Any) -> Any:
return _wrapper return _wrapper
def convert_file_to_wav(filename: str) -> str: class SpeechToTextService:
new_filename = filename + '.wav' def __init__(self, filename: str) -> None:
self.executor = ThreadPoolExecutor()
cmd = ['ffmpeg', '-loglevel', 'quiet', '-i', filename, '-vn', new_filename] self.filename = filename
self.recognizer = Recognizer()
self.recognizer.energy_threshold = 50
self.text_parts: dict[int, str] = {}
self.text_recognised = False
try: def get_text_from_audio(self) -> None:
subprocess.run(args=cmd) # noqa: S603 self.executor.submit(self.worker)
except Exception as error:
logger.error("cant convert voice: reason", error=error) def worker(self) -> Any:
return new_filename self._convert_file_to_wav()
self._convert_audio_to_text()
def _convert_audio_to_text(self) -> None:
wav_filename = f'{self.filename}.wav'
speech = AudioSegment.from_wav(wav_filename)
speech_duration = len(speech)
pieces = speech_duration // AUDIO_SEGMENT_DURATION + 1
ending = speech_duration % AUDIO_SEGMENT_DURATION
for i in range(pieces):
if i == 0 and pieces == 1:
sound_segment = speech[0:ending]
elif i == 0:
sound_segment = speech[0 : (i + 1) * AUDIO_SEGMENT_DURATION]
elif i == (pieces - 1):
sound_segment = speech[i * AUDIO_SEGMENT_DURATION - 250 : i * AUDIO_SEGMENT_DURATION + ending]
else:
sound_segment = speech[i * AUDIO_SEGMENT_DURATION - 250 : (i + 1) * AUDIO_SEGMENT_DURATION]
self.text_parts[i] = self._recognize_by_google(wav_filename, sound_segment)
self.text_recognised = True
# clean temp voice message main files
try:
os.remove(wav_filename)
os.remove(self.filename)
except FileNotFoundError as error:
logger.error("error temps files not deleted", error=error, filenames=[self.filename, self.filename])
def _convert_file_to_wav(self) -> None:
new_filename = self.filename + '.wav'
cmd = ['ffmpeg', '-loglevel', 'quiet', '-i', self.filename, '-vn', new_filename]
try:
subprocess.run(args=cmd) # noqa: S603
logger.info("file has been converted to wav", filename=new_filename)
except Exception as error:
logger.error("cant convert voice", error=error, filename=self.filename)
def _recognize_by_google(self, filename: str, sound_segment: AudioSegment) -> str:
tmp_filename = f"{filename}_tmp_part"
sound_segment.export(tmp_filename, format="wav")
with AudioFile(tmp_filename) as source:
audio_text = self.recognizer.listen(source)
try:
text = self.recognizer.recognize_google(audio_text, language='ru-RU')
os.remove(tmp_filename)
return text
except SpeechRecognizerError as error:
os.remove(tmp_filename)
logger.error("error recognizing text with google", error=error)
raise error

View File

@ -2,8 +2,10 @@ import asyncio
from functools import cached_property from functools import cached_property
import sentry_sdk import sentry_sdk
from constants import LogLevelEnum
from core.bot import BotApplication, BotQueue from core.bot import BotApplication, BotQueue
from core.handlers import bot_event_handlers from core.handlers import bot_event_handlers
from core.logging import configure_logging
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.responses import UJSONResponse from fastapi.responses import UJSONResponse
from routers import api_router from routers import api_router
@ -27,6 +29,7 @@ class Application:
self.app.include_router(api_router) self.app.include_router(api_router)
self.configure_hooks() self.configure_hooks()
configure_logging(level=LogLevelEnum.INFO, enable_json_logs=True, enable_sentry_logs=True)
if settings.SENTRY_DSN is not None: if settings.SENTRY_DSN is not None:
sentry_sdk.init( sentry_sdk.init(

View File

@ -1,6 +1,6 @@
# Telegram bot. Redirects to bot-service # Telegram bot. Redirects to bot-service
:8083 { :8083 {
reverse_proxy bot_service:8080 reverse_proxy bot_service:8000
header Strict-Transport-Security max-age=31536000; header Strict-Transport-Security max-age=31536000;
# Removing some headers for improved security: # Removing some headers for improved security:

View File

@ -28,9 +28,9 @@ RUN printf "================ Start build base service. with USER: ${USER} ======
WORKDIR /app/ WORKDIR /app/
RUN if [ "$USER" != "root" ]; then \ RUN if [ "$USER" != "root" ]; then \
mkdir /home/"$USER" \ mkdir /home/${USER} \
&& groupadd -r "$USER" && useradd -d /home/"$USER" -r -g "$USER" "$USER" \ && groupadd -r ${USER} && useradd -d /home/${USER} -r -g ${USER} ${USER} \
&& chown "$USER":"$USER" -R /home/"$USER"; \ && chown ${USER}:${USER} -R /home/${USER}; \
fi fi
COPY --chown=${USER}:${USER} ../poetry.lock ../pyproject.toml /app/ COPY --chown=${USER}:${USER} ../poetry.lock ../pyproject.toml /app/
@ -51,6 +51,7 @@ WORKDIR /app/
# Copying bot service # Copying bot service
COPY --chown=${USER}:${USER} ../bot_microservice /app/ COPY --chown=${USER}:${USER} ../bot_microservice /app/
RUN mkdir "/app/shared" -p && chown ${USER}:${USER} -R /app/shared
COPY ./scripts/start-bot.sh /app/ COPY ./scripts/start-bot.sh /app/
RUN chmod +x ./start-bot.sh RUN chmod +x ./start-bot.sh
@ -72,9 +73,6 @@ WORKDIR /app/
# Copying bot service # Copying bot service
COPY --chown=${USER}:${USER} ../chat_gpt_microservice /app/ COPY --chown=${USER}:${USER} ../chat_gpt_microservice /app/
COPY ./scripts/start-chat.sh /app/
RUN chmod +x ./start-chat.sh
COPY --from=compile-image /app/.venv /app/.venv COPY --from=compile-image /app/.venv /app/.venv
ENV PATH="/app/.venv/bin:$PATH" ENV PATH="/app/.venv/bin:$PATH"
@ -82,4 +80,4 @@ USER ${USER}
RUN chmod -R 777 translations RUN chmod -R 777 translations
CMD ["bash", "start-chat.sh"] CMD ["python3", "./run.py"]

View File

@ -6,7 +6,7 @@ if [[ "${START_WITH_WEBHOOK}" == "true" ]]
then then
echo "Starting bot in webhook mode..." echo "Starting bot in webhook mode..."
gunicorn main:create_app \ gunicorn main:create_app \
-- workers ${WORKERS_COUNT} \ --workers ${WORKERS_COUNT} \
--bind ${APP_HOST}:${APP_PORT} \ --bind ${APP_HOST}:${APP_PORT} \
--worker-class uvicorn.workers.UvicornWorker \ --worker-class uvicorn.workers.UvicornWorker \
--timeout 150 \ --timeout 150 \

View File

@ -1,11 +0,0 @@
#! /bin/bash
echo "starting chat"
gunicorn run:create_app \
-- workers ${WORKERS_COUNT} \
--bind ${APP_HOST}:${APP_PORT} \
--worker-class uvicorn.workers.UvicornWorker \
--timeout 150 \
--max-requests 2000 \
--max-requests-jitter 400