telegram-azure-transcribe-bot/bot.py


## Main bot
# A lot of the handler code for speechsdk is copy/pasted from the
# Azure documentation example for continuous recognition for compressed
# audio, but reworked to use pydub instead.
from telegram import Update
from telegram.constants import ChatAction
from telegram.ext import ApplicationBuilder, ContextTypes, MessageHandler, CommandHandler, PicklePersistence
from telegram.ext.filters import VOICE, Chat
import logging
from os import unlink, environ
from dotenv import load_dotenv
from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer
from azure.cognitiveservices.speech.audio import AudioConfig
from asyncio import sleep
from pydub import AudioSegment
from tempfile import NamedTemporaryFile
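# Job-queue callback: close a temporary file's handle and delete it from disk.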
async def clean_file_async(context: ContextTypes.DEFAULT_TYPE):
    job = context.job
    job.data['file'].close()
    unlink(job.data['file'].name)
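# Voice-message handler: download the OGG/Opus voice note, convert it to WAV with
# pydub, run Azure continuous recognition on it, and reply with the transcription.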
async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE):
    logger = logging.getLogger('Recognizer')
    logger.info("Received voice message!")
    chat_id = update.effective_chat.id
    voice_id = update.message.id

    status_msg = await context.bot.send_message(
        chat_id=chat_id,
        text="Working on it!",
        reply_to_message_id=voice_id
    )

    def clean_file(file):
        context.job_queue.run_once(clean_file_async, 60.0, {'file': file})
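    # Telegram serves voice notes as OGG/Opus; convert to WAV with pydub (which
    # uses ffmpeg under the hood) so the Speech SDK can read it from disk.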
    new_file = await context.bot.get_file(update.message.voice.file_id)
    ogg_file = NamedTemporaryFile(mode='wb', suffix='.oga', delete=False)
    logger.debug(f"Created temporary oga at {ogg_file.name}")
    ogg_file.close()
    await new_file.download_to_drive(ogg_file.name)
    ogg_seg = AudioSegment.from_ogg(ogg_file.name)
    logger.debug(f"Imported ogg to pydub, deleting {ogg_file.name}")
    clean_file(ogg_file)
    wav_file = NamedTemporaryFile(mode='wb', suffix=".wav", delete=False)
    logger.debug(f"Created temporary wav file at {wav_file.name} to transcribe")
    ogg_seg.export(wav_file, format="wav")
    wav_file.close()
    await status_msg.edit_text("Converted!")
    global speech_config
    audio_config = AudioConfig(filename=wav_file.name)
    speech_recognizer = SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    transcription = ""
    ###
    # The commented-out code below is intended for whenever I figure out how
    # best to run async calls (send message/edit/delete) from a synchronous
    # callback nested inside an async function. Azure Speech won't call the
    # callbacks asynchronously even when doing 'async recognition', annoyingly.
    ###
    # async def update_message_async(evt, current_message=current_message):
    #     """Takes event from Azure and updates a message object to show realtime transcribing"""
    #     logger.info(f"Async Message: {evt.result.text}")
    #     if not current_message:
    #         current_message = await context.bot.send_message(chat_id=chat_id,
    #                                                          text=evt.result.text,
    #                                                          reply_to_message_id=voice_id)
    #     else:
    #         await current_message.edit_text(evt.result.text)
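    # A possible (untested) sketch for the above: the Speech SDK fires these
    # callbacks on its own worker thread, so coroutines can be handed back to
    # the bot's event loop with asyncio.run_coroutine_threadsafe. Assumes
    # `from asyncio import get_running_loop, run_coroutine_threadsafe` and a
    # hypothetical update_message callback wired to the `recognizing` event:
    #
    # loop = get_running_loop()
    #
    # def update_message(evt):
    #     # Schedule the edit on the bot's loop from the SDK's callback thread.
    #     run_coroutine_threadsafe(status_msg.edit_text(evt.result.text), loop)
    #
    # speech_recognizer.recognizing.connect(update_message)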
    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        logger.debug('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def finish_message(evt):
        nonlocal transcription
        transcription += evt.result.text + " "

    speech_recognizer.recognized.connect(finish_message)
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)
    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition_async()
    if update.message.voice.duration > 120:
        await status_msg.edit_text("Recognizing. Due to audio length, this could take a few minutes. Please be patient. You'll get notified when it's completed.")
    else:
        await status_msg.edit_text('Recognizing...')
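    # Keep the chat's "typing..." indicator alive until a stopped/canceled event
    # flips `done`; Telegram chat actions expire after about five seconds.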
    while not done:
        await context.bot.send_chat_action(chat_id=chat_id, action=ChatAction.TYPING)
        await sleep(5)

    speech_recognizer.stop_continuous_recognition_async()
    await status_msg.delete()
    await context.bot.send_message(
        chat_id=chat_id,
        text=transcription,
        reply_to_message_id=voice_id
    )
    logger.debug(f"Cleaning up, deleting {wav_file.name}")
    clean_file(wav_file)
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await context.bot.send_message(chat_id=update.effective_chat.id, text="Forward me a voice message and I'll transcribe it!")

async def help(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await context.bot.send_message(chat_id=update.effective_chat.id, text="Forward me a voice message. It may take a few minutes to transcribe. I'll reply to the voice message with a new message containing the transcription, so you can go do something else while you're waiting.")
if __name__ == '__main__':
    load_dotenv()

    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=environ.get('LOG_LEVEL', 'WARN')
    )
    lg = logging.getLogger('main')

    global speech_config
    speech_config = SpeechConfig(subscription=environ.get('SPEECH_KEY'), region=environ.get('SPEECH_REGION'))

    pers = PicklePersistence(filepath='bot.pickle')
    bot_token = environ.get('TELEGRAM_BOT_TOKEN')

    application = ApplicationBuilder().token(bot_token)
    application = application.persistence(persistence=pers)
    application = application.build()

    start_handler = CommandHandler('start', start)
    help_handler = CommandHandler('help', help)
    voice_handler = MessageHandler(VOICE, handle_voice, block=True)
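    # Optionally restrict the bot to specific chats via TELEGRAM_BOT_ALLOWED_CHAT_IDS,
    # a comma-separated list of chat IDs.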
    if environ.get('TELEGRAM_BOT_ALLOWED_CHAT_IDS'):
        chat_ids = [int(x) for x in environ.get('TELEGRAM_BOT_ALLOWED_CHAT_IDS').split(',')]
        lg.info(f"Restricting to these chats: {chat_ids}")
        start_handler.filters = Chat(chat_id=chat_ids)
        help_handler.filters = Chat(chat_id=chat_ids)
        voice_handler.filters = VOICE & Chat(chat_id=chat_ids)

    application.add_handler(start_handler)
    application.add_handler(help_handler)
    application.add_handler(voice_handler)
    application.run_polling()