telegram-azure-transcribe-bot/bot.py


## Main bot
# A lot of the handler code for speechsdk is copy/pasted from the
# Azure documentation example for continuous recognition for compressed
# audio, but reworked to use pydub instead.
from telegram import Update
from telegram.constants import ChatAction
from telegram.ext import ApplicationBuilder, ContextTypes, MessageHandler, CommandHandler, PicklePersistence
from telegram.ext.filters import VOICE, Chat
import logging
from os import unlink, environ
from dotenv import load_dotenv
from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer
from azure.cognitiveservices.speech.audio import AudioConfig
from asyncio import sleep
from pydub import AudioSegment
from tempfile import NamedTemporaryFile
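# Job-queue callback: close a temporary file's handle and delete it from disk.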
async def clean_file_async(context: ContextTypes.DEFAULT_TYPE):
    job = context.job
    job.data['file'].close()
    unlink(job.data['file'].name)
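# Voice-message handler: download the OGG/Opus voice note, convert it to WAV with
# pydub, run Azure continuous recognition on it, and reply with the transcription.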
async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE):
    logger = logging.getLogger('Recognizer')
    logger.info("Received voice message!")
    chat_id = update.effective_chat.id
    voice_id = update.message.id

    status_msg = await context.bot.send_message(
        chat_id=chat_id,
        text="Working on it!",
        reply_to_message_id=voice_id
    )

    def clean_file(file):
        context.job_queue.run_once(clean_file_async, 60.0, {'file': file})
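    # Telegram serves voice notes as OGG/Opus; convert to WAV with pydub (which
    # uses ffmpeg under the hood) so the Speech SDK can read it from disk.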
    new_file = await context.bot.get_file(update.message.voice.file_id)
    ogg_file = NamedTemporaryFile(mode='wb', suffix='.oga', delete=False)
    logger.debug(f"Created temporary oga at {ogg_file.name}")
    ogg_file.close()
    await new_file.download_to_drive(ogg_file.name)
    ogg_seg = AudioSegment.from_ogg(ogg_file.name)
    logger.debug(f"Imported ogg to pydub, deleting {ogg_file.name}")
    clean_file(ogg_file)
    wav_file = NamedTemporaryFile(mode='wb', suffix=".wav", delete=False)
    logger.debug(f"Created temporary wav file at {wav_file.name} to transcribe")
    ogg_seg.export(wav_file, format="wav")
    wav_file.close()
    await status_msg.edit_text("Converted!")
    global speech_config
    audio_config = AudioConfig(filename=wav_file.name)
    speech_recognizer = SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    transcription = ""
    ###
    # The commented-out code below is intended for whenever I figure out how
    # best to run async calls (send message/edit/delete) from a synchronous
    # callback nested inside an async function. Azure Speech won't call the
    # callbacks asynchronously even when doing 'async recognition', annoyingly.
    ###
    # async def update_message_async(evt, current_message=current_message):
    #     """Takes event from Azure and updates a message object to show realtime transcribing"""
    #     logger.info(f"Async Message: {evt.result.text}")
    #     if not current_message:
    #         current_message = await context.bot.send_message(chat_id=chat_id,
    #                                                          text=evt.result.text,
    #                                                          reply_to_message_id=voice_id)
    #     else:
    #         await current_message.edit_text(evt.result.text)
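    # A possible (untested) sketch for the above: the Speech SDK fires these
    # callbacks on its own worker thread, so coroutines can be handed back to
    # the bot's event loop with asyncio.run_coroutine_threadsafe. Assumes
    # `from asyncio import get_running_loop, run_coroutine_threadsafe` and a
    # hypothetical update_message callback wired to the `recognizing` event:
    #
    # loop = get_running_loop()
    #
    # def update_message(evt):
    #     # Schedule the edit on the bot's loop from the SDK's callback thread.
    #     run_coroutine_threadsafe(status_msg.edit_text(evt.result.text), loop)
    #
    # speech_recognizer.recognizing.connect(update_message)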
    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        logger.debug('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def finish_message(evt):
        nonlocal transcription
        transcription += evt.result.text + " "

    speech_recognizer.recognized.connect(finish_message)
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)
    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition_async()
    if update.message.voice.duration > 120:
        await status_msg.edit_text("Recognizing. Due to audio length, this could take a few minutes. Please be patient. You'll get notified when it's completed.")
    else:
        await status_msg.edit_text('Recognizing...')
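    # Keep the chat's "typing..." indicator alive until a stopped/canceled event
    # flips `done`; Telegram chat actions expire after about five seconds.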
    while not done:
        await context.bot.send_chat_action(chat_id=chat_id, action=ChatAction.TYPING)
        await sleep(5)

    speech_recognizer.stop_continuous_recognition_async()
    await status_msg.delete()
    await context.bot.send_message(
        chat_id=chat_id,
        text=transcription,
        reply_to_message_id=voice_id
    )
    logger.debug(f"Cleaning up, deleting {wav_file.name}")
    clean_file(wav_file)
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await context.bot.send_message(chat_id=update.effective_chat.id, text="Forward me a voice message and I'll transcribe it!")

async def help(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await context.bot.send_message(chat_id=update.effective_chat.id, text="Forward me a voice message. It may take a few minutes to transcribe. I'll reply to the voice message with a new message containing the transcription, so you can go do something else while you're waiting.")
if __name__ == '__main__':
    load_dotenv()

    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=environ.get('LOG_LEVEL', 'WARN')
    )
    lg = logging.getLogger('main')

    global speech_config
    speech_config = SpeechConfig(subscription=environ.get('SPEECH_KEY'), region=environ.get('SPEECH_REGION'))

    pers = PicklePersistence(filepath='bot.pickle')
    bot_token = environ.get('TELEGRAM_BOT_TOKEN')

    application = ApplicationBuilder().token(bot_token)
    application = application.persistence(persistence=pers)
    application = application.build()

    start_handler = CommandHandler('start', start)
    help_handler = CommandHandler('help', help)
    voice_handler = MessageHandler(VOICE, handle_voice, block=True)
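    # Optionally restrict the bot to specific chats via TELEGRAM_BOT_ALLOWED_CHAT_IDS,
    # a comma-separated list of chat IDs.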
    if environ.get('TELEGRAM_BOT_ALLOWED_CHAT_IDS'):
        chat_ids = [int(x) for x in environ.get('TELEGRAM_BOT_ALLOWED_CHAT_IDS').split(',')]
        lg.info(f"Restricting to these chats: {chat_ids}")
        start_handler.filters = Chat(chat_id=chat_ids)
        help_handler.filters = Chat(chat_id=chat_ids)
        voice_handler.filters = VOICE & Chat(chat_id=chat_ids)

    application.add_handler(start_handler)
    application.add_handler(help_handler)
    application.add_handler(voice_handler)
    application.run_polling()