Azure speech recognition for Irish
Authentication is a bit of a pain; recognition is fantastic, but it is let down by the inverse text normalisation of numbers.
%%capture
# Install the Azure Speech SDK; %%capture suppresses the installer output.
!pip install azure-cognitiveservices-speech
%%capture
# Install youtube-dl, which is used below to fetch the sample video.
!pip install youtube-dl
%%capture
# Download the sample video from YouTube into the working directory.
!youtube-dl https://www.youtube.com/watch?v=cfjdfaqWY3Y
%%capture
# Convert the downloaded .mkv to a mono, 16 kHz, 16-bit PCM WAV file.
!ffmpeg -i Cúla4\ Ar\ Scoil\ _\ Ábhar\ -\ Mata\ _\ Téama\ -\ Bia-cfjdfaqWY3Y.mkv -acodec pcm_s16le -ac 1 -ar 16000 cfjdfaqWY3Y.wav
# Show an inline audio player for the converted file.
# NOTE(review): the absolute /content path presumably assumes Google Colab — confirm.
import IPython
IPython.display.Audio('/content/cfjdfaqWY3Y.wav')
import azure.cognitiveservices.speech as speechsdk
Use either Key1 or Key2 (in the Azure Portal, under "Keys and Endpoints" in the menu on the left-hand side of the screen) as the value of `_SUBS` below.
# Subscription key of the Azure Speech resource; paste it here and keep it
# out of version control.
_SUBS = ''
# Azure region the Speech resource was created in.
_LOC = 'westeurope'

speech_config = speechsdk.SpeechConfig(subscription=_SUBS, region=_LOC)
audio_input = speechsdk.audio.AudioConfig(filename='cfjdfaqWY3Y.wav')

# Recognise Irish, and ask for the detailed result payload with per-word timings.
speech_config.speech_recognition_language = 'ga-IE'
speech_config.request_word_level_timestamps()
speech_config.output_format = speechsdk.OutputFormat.Detailed

# endpoint_id is deliberately left unset: it identifies a Custom Speech model
# deployment, and this notebook previously assigned the token-issuing URL to
# it, which is not a deployment ID. The subscription key and region passed to
# SpeechConfig above are what the SDK uses to authenticate.

# Write the SDK's diagnostic log to azure.log in the working directory.
speech_config.set_property(speechsdk.PropertyId.Speech_LogFilename, "azure.log")
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
import time
def speech_recognize_continuous_from_file(speech_config, audio_config, language='ga-IE'):
    """Run continuous speech recognition on an audio input and print every event.

    Blocks until the recognizer reports that the session stopped or that
    recognition was canceled, then stops continuous recognition.

    Args:
        speech_config: ``speechsdk.SpeechConfig`` passed to the recognizer.
        audio_config: ``speechsdk.audio.AudioConfig`` describing the input.
        language: Recognition language passed to the recognizer; defaults to
            ``'ga-IE'``, the value that used to be hard-coded here.
    """
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        language=language,
        audio_config=audio_config,
    )

    done = False

    def stop_cb(evt):
        """Signal the polling loop below to finish once `evt` is received."""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def cancelled(evt):
        """Print why recognition was canceled, with error details if any."""
        result = evt.result
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(cancelled)

    # Stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and poll until a stop event arrives.
    speech_recognizer.start_continuous_recognition()
    try:
        while not done:
            time.sleep(.5)
    finally:
        # Also runs when the wait is interrupted (e.g. the notebook cell is
        # stopped), so the recognizer is not left running in the background.
        speech_recognizer.stop_continuous_recognition()
# Transcribe the converted WAV file; events are printed as they arrive.
speech_recognize_continuous_from_file(
    speech_config=speech_config,
    audio_config=audio_input,
)
Debugging with curl
# Request an access token from the regional issueToken endpoint using the
# subscription key; IPython substitutes {_LOC} and {_SUBS} into the command.
!curl -v -X POST "https://{_LOC}.api.cognitive.microsoft.com/sts/v1.0/issueToken" -H "Ocp-Apim-Subscription-Key: {_SUBS}" -H "Content-type: application/x-www-form-urlencoded" -H "Content-Length: 0"
# Paste the token returned by the request above here.
_TOK=''
# Send the WAV file to the speech-to-text REST endpoint for ga-IE,
# authenticating with the bearer token in _TOK.
!curl -v -X POST "https://{_LOC}.stt.speech.microsoft.com/speech/recognition/interactive/cognitiveservices/v1?language=ga-IE" -H "Authorization: Bearer {_TOK}" -H "Transfer-Encoding: chunked" -H "Content-type: audio/wav; codec=audio/pcm; samplerate=16000" --data-binary @cfjdfaqWY3Y.wav
Next step, get at the innards (TODO)
import json
import logging

logger = logging.getLogger(__name__)

# Accumulators filled by parse_azure_result. The first three receive exactly
# one entry per parsed result, so they stay index-aligned; `words` is the
# flattened list of word entries from every parsed result.
transcript_display_list = []
transcript_ITN_list = []
confidence_list = []
words = []


def parse_azure_result(evt):
    """Extract the most confident hypothesis from a recognition event.

    Parses ``evt.result.json`` and appends the display text, the confidence,
    the ITN text and the word entries of the highest-confidence ``NBest``
    hypothesis to the module-level accumulator lists.

    A result without any ``NBest`` hypotheses is logged and skipped, and
    nothing is appended for it, so the accumulator lists stay aligned.

    Args:
        evt: Event object exposing the result payload as a JSON string at
            ``evt.result.json``.
    """
    response = json.loads(evt.result.json)

    hypotheses = response.get('NBest') or []
    if not hypotheses:
        logger.debug('Result has no NBest hypotheses, skipping: %s', evt)
        return

    # A missing or null confidence ranks lowest; ties keep the first hypothesis.
    best = max(hypotheses, key=lambda hyp: hyp.get('Confidence') or 0.0)

    # Collect every value before appending so a malformed payload cannot
    # leave the lists with different lengths.
    display_text = response.get('DisplayText', best.get('Display', ''))
    confidence = best.get('Confidence')
    itn_text = best.get('ITN', '')
    word_entries = best.get('Words') or []

    transcript_display_list.append(display_text)
    confidence_list.append(confidence)
    transcript_ITN_list.append(itn_text)
    words.extend(word_entries)
    logger.debug(evt)