2023-11-30 12:17:02 -06:00
## Main bot
# A lot of the handler code for speechsdk is copy/pasted from the
# Azure documentation example for continuous recognition for compressed
# audio, but reworked to use pydub instead.
2023-11-30 09:38:24 -06:00
from telegram import Update
from telegram . constants import ChatAction
from telegram . ext import ApplicationBuilder , ContextTypes , MessageHandler , CommandHandler , PicklePersistence
2023-11-30 12:05:11 -06:00
from telegram . ext . filters import VOICE , Chat
2023-11-30 09:38:24 -06:00
import logging
2023-11-30 21:02:45 -06:00
from os import unlink , environ
2023-11-30 09:38:24 -06:00
from dotenv import load_dotenv
2023-11-30 12:29:47 -06:00
from azure . cognitiveservices . speech import SpeechConfig , SpeechRecognizer
from azure . cognitiveservices . speech . audio import AudioConfig
2023-11-30 12:17:02 -06:00
from asyncio import sleep
2023-11-30 09:38:24 -06:00
from pydub import AudioSegment
2023-11-30 12:17:02 -06:00
from tempfile import NamedTemporaryFile
2023-11-30 09:38:24 -06:00
2023-11-30 12:17:02 -06:00
async def clean_file_async(context: ContextTypes.DEFAULT_TYPE):
    """Job callback that closes and deletes a temporary file.

    Reads the file object from ``context.job.data[' file ']``, closes it and
    removes the path given by its ``name`` attribute.  A file that has already
    disappeared is not an error: the goal of the job is already achieved, so
    it is only noted at debug level instead of failing the job.
    """
    temp = context.job.data[' file ']
    temp.close()
    try:
        unlink(temp.name)
    except FileNotFoundError:
        logging.getLogger(__name__).debug(
            "Temporary file %s was already removed", temp.name
        )
2023-11-30 09:38:24 -06:00
def _split_for_telegram(text, limit=4096):
    """Cut *text* into consecutive pieces of at most *limit* characters."""
    return [text[i:i + limit] for i in range(0, len(text), limit)]


async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Transcribe a voice message and reply to it with the recognized text.

    Steps:
      1. Post a status message as a reply to the voice message.
      2. Download the voice note to a temporary .oga file and convert it to a
         temporary .wav file with pydub.
      3. Run Azure continuous recognition on the .wav file, using the
         module-level ``speech_config``, while refreshing the "typing" chat
         action every five seconds until the recognizer reports that the
         session stopped or was canceled.
      4. Delete the status message and reply with the transcription.

    Both temporary files are handed to the job queue for deletion (60 seconds
    later, via ``clean_file_async``) even when a step raises.  If nothing was
    recognized a short notice is sent instead of an empty message, and long
    transcriptions are split into several messages.
    """
    logger = logging.getLogger(' Recognizer ')
    logger.info(" Received voice message! ")
    chat_id = update.effective_chat.id
    voice_id = update.message.id
    status_msg = await context.bot.send_message(
        chat_id=chat_id,
        text=" Working on it! ",
        reply_to_message_id=voice_id,
    )

    def clean_file(file):
        # Deletion is deferred through the job queue rather than done inline.
        context.job_queue.run_once(clean_file_async, 60.0, {' file ': file})

    new_file = await context.bot.get_file(update.message.voice.file_id)

    ogg_file = NamedTemporaryFile(mode='wb', suffix=' .oga ', delete=False)
    try:
        logger.debug(f" Created temporary oga at {ogg_file.name} ")
        # Only the path is needed; the download writes to it by name.
        ogg_file.close()
        await new_file.download_to_drive(ogg_file.name)
        ogg_seg = AudioSegment.from_ogg(ogg_file.name)
        logger.debug(f" Imported ogg to pydub, deleting {ogg_file.name} ")
    finally:
        clean_file(ogg_file)

    wav_file = NamedTemporaryFile(mode='wb', suffix=" .wav ", delete=False)
    try:
        logger.debug(f" Created temporary wav file at {wav_file.name} to transcribe ")
        ogg_seg.export(wav_file, format="wav")
        wav_file.close()
        await status_msg.edit_text(" Converted! ")

        # speech_config is a module-level name that is only read here, so no
        # `global` declaration is required.
        audio_config = AudioConfig(filename=wav_file.name)
        speech_recognizer = SpeechRecognizer(
            speech_config=speech_config, audio_config=audio_config
        )

        done = False
        pieces = []

        # TODO: show the transcription live by editing a message from the
        # recognizer callbacks.  Azure speech invokes its callbacks
        # synchronously even for "async" recognition, so awaiting bot calls
        # (send/edit/delete) from inside them still needs a bridge into this
        # coroutine.

        def stop_cb(evt):
            """Flag that continuous recognition has ended for event `evt`."""
            logger.debug(' CLOSING on {} '.format(evt))
            nonlocal done
            done = True

        def finish_message(evt):
            """Collect the text of one recognized utterance."""
            if evt.result.text:
                pieces.append(evt.result.text)

        speech_recognizer.recognized.connect(finish_message)
        # Stop on either the session-stopped or the canceled event.
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        speech_recognizer.start_continuous_recognition_async()

        if update.message.voice.duration > 120:
            await status_msg.edit_text(" Recognizing. Due to audio length, this could take a few minutes. Please be patient. You ' ll get notified when it ' s completed. ")
        else:
            await status_msg.edit_text(' Recognizing... ')

        # Keep the "typing" indicator alive while waiting for stop_cb to fire.
        while not done:
            await context.bot.send_chat_action(chat_id=chat_id, action=ChatAction.TYPING)
            await sleep(5)

        speech_recognizer.stop_continuous_recognition_async()
        await status_msg.delete()

        transcription = " ".join(pieces).strip()
        if not transcription:
            # Telegram rejects empty message text, so say so explicitly.
            transcription = "No speech could be recognized in this voice message."

        # Telegram caps message text at 4096 characters; send long results in
        # several replies instead of failing.
        for chunk in _split_for_telegram(transcription):
            await context.bot.send_message(
                chat_id=chat_id,
                text=chunk,
                reply_to_message_id=voice_id,
            )
    finally:
        logger.debug(f" Cleaning up, deleting {wav_file.name} ")
        clean_file(wav_file)
2023-11-30 09:38:24 -06:00
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Answer the /start command with a one-line usage hint in the same chat."""
    greeting = " Forward me a voice message and I ' ll transcribe it! "
    target_chat = update.effective_chat.id
    await context.bot.send_message(chat_id=target_chat, text=greeting)
2023-11-30 12:05:11 -06:00
async def help(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Answer the /help command with a longer explanation in the same chat."""
    explanation = " Forward me a voice message. It may take a few minutes to transcribe. I ' ll reply to the voice message with a new one containing the transcription, so you can go do something else while you ' re waiting. "
    target_chat = update.effective_chat.id
    await context.bot.send_message(chat_id=target_chat, text=explanation)
2023-11-30 09:38:24 -06:00
if __name__ == '__main__':
    # Script entry point: load settings, build the bot, register handlers and
    # start long polling.
    load_dotenv()
    logging.basicConfig(
        format=' %(asctime)s - %(name)s - %(levelname)s - %(message)s ',
        # Accept e.g. "debug" or " INFO " from the environment.
        level=environ.get('LOG_LEVEL', 'WARN').strip().upper(),
    )
    lg = logging.getLogger(' main ')

    # Module-level name read by handle_voice when it builds a recognizer.
    # (A `global` statement is a no-op at module scope, so none is used.)
    speech_config = SpeechConfig(
        subscription=environ.get('SPEECH_KEY'),
        region=environ.get('SPEECH_REGION'),
    )

    pers = PicklePersistence(filepath=' bot.pickle ')
    bot_token = environ.get('TELEGRAM_BOT_TOKEN')
    application = (
        ApplicationBuilder()
        .token(bot_token)
        .persistence(persistence=pers)
        .build()
    )

    start_handler = CommandHandler('start', start)
    help_handler = CommandHandler('help', help)
    voice_handler = MessageHandler(VOICE, handle_voice, block=True)

    # Optional allow-list: a comma-separated list of chat ids.  Blank entries
    # (stray or trailing commas) are skipped instead of crashing int().
    allowed_raw = environ.get('TELEGRAM_BOT_ALLOWED_CHAT_IDS')
    if allowed_raw:
        chat_ids = [int(part) for part in allowed_raw.split(',') if part.strip()]
        lg.info(f" Restricting to these chats: {chat_ids} ")
        start_handler.filters = Chat(chat_id=chat_ids)
        help_handler.filters = Chat(chat_id=chat_ids)
        voice_handler.filters = VOICE & Chat(chat_id=chat_ids)

    for handler in (start_handler, help_handler, voice_handler):
        application.add_handler(handler)

    application.run_polling()