%%capture
!pip install azure-cognitiveservices-speech

%%capture
!pip install youtube-dl

%%capture
!youtube-dl https://www.youtube.com/watch?v=cfjdfaqWY3Y

%%capture
!ffmpeg -i Cúla4\ Ar\ Scoil\ _\ Ábhar\ -\ Mata\ _\ Téama\ -\ Bia-cfjdfaqWY3Y.mkv -acodec pcm_s16le -ac 1 -ar 16000 cfjdfaqWY3Y.wav

# Import the submodule explicitly: a bare `import IPython` does not by itself
# guarantee that `IPython.display` has been loaded.
import IPython.display

# Play the converted audio inline. The relative path matches the file name
# written by ffmpeg and read by AudioConfig, so this also works when the
# working directory is not Colab's /content.
IPython.display.Audio('cfjdfaqWY3Y.wav')

import azure.cognitiveservices.speech as speechsdk

Use either Key 1 or Key 2 (in the Azure Portal, under "Keys and Endpoint" in the menu on the left-hand side of the screen).

# Subscription key of the Azure Speech resource (left blank here; fill in
# before running).
_SUBS=''

# Azure region of the Speech resource.
_LOC='westeurope'

speech_config = speechsdk.SpeechConfig(region=_LOC, subscription=_SUBS)

# Recognition input: the WAV file produced by the ffmpeg step.
audio_input = speechsdk.audio.AudioConfig(filename='cfjdfaqWY3Y.wav')

speech_config.speech_recognition_language = 'ga-IE'
speech_config.request_word_level_timestamps()
# Detailed output (enum value 1) is what carries the NBest list.
speech_config.output_format = speechsdk.OutputFormat.Detailed
# NOTE: `endpoint_id` is deliberately not set. It identifies a Custom Speech
# model deployment; assigning the token-issuing URL to it (as was done
# before) is not a valid value. The region and key above are sufficient for
# the base model.

# Write SDK diagnostics to a local file to help debug failed sessions.
speech_config.set_property(speechsdk.PropertyId.Speech_LogFilename, "azure.log")

# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
import time
def speech_recognize_continuous_from_file(speech_config, audio_config, language='ga-IE'):
    """Run continuous speech recognition on an audio input and print each event.

    Blocks until the recognizer fires either ``session_stopped`` or
    ``canceled``, polling a flag set by the callbacks twice per second.

    Args:
        speech_config: Configuration handed to ``speechsdk.SpeechRecognizer``.
        audio_config: Audio input handed to ``speechsdk.SpeechRecognizer``.
        language: Recognition language handed to the recognizer. Defaults to
            ``'ga-IE'``, the value that used to be hard-coded.

    Returns:
        None. Events are reported with ``print``.
    """
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        language=language,
        audio_config=audio_config,
    )

    done = False

    def stop_cb(evt):
        """Signal the wait loop to finish upon receiving an event `evt`."""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def cancelled(evt):
        """Print why recognition was canceled, with error details if any."""
        cancellation_details = evt.result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(cancelled)
    # Stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    try:
        while not done:
            time.sleep(.5)
    finally:
        # Always stop the recognizer, including when the wait is interrupted
        # (e.g. the notebook cell is stopped by hand), so it is not left
        # running in the background.
        speech_recognizer.stop_continuous_recognition()

# Run recognition with the module-level `speech_config` and `audio_input`;
# this call blocks until the session stops or is canceled.
speech_recognize_continuous_from_file(speech_config, audio_input)

Debugging with curl

!curl -v -X POST "https://{_LOC}.api.cognitive.microsoft.com/sts/v1.0/issueToken" -H "Ocp-Apim-Subscription-Key: {_SUBS}" -H "Content-type: application/x-www-form-urlencoded" -H "Content-Length: 0"

_TOK=''

!curl -v -X POST "https://{_LOC}.stt.speech.microsoft.com/speech/recognition/interactive/cognitiveservices/v1?language=ga-IE" -H "Authorization: Bearer {_TOK}" -H "Transfer-Encoding: chunked" -H "Content-type: audio/wav; codec=audio/pcm; samplerate=16000" --data-binary @cfjdfaqWY3Y.wav

Next step: extract the details (transcripts, confidence scores and word-level entries) from each recognition result (TODO)

# Accumulators filled by parse_azure_result(). The first three grow by one
# entry per successfully parsed result, so they stay index-aligned.
transcript_display_list = []  # display-form transcript of each result
transcript_ITN_list = []      # 'ITN' text of the most confident hypothesis
confidence_list = []          # 'Confidence' of that same hypothesis
words = []                    # 'Words' entries of that hypothesis, flattened


def parse_azure_result(evt):
    """Record the most confident hypothesis of a recognition event.

    Parses the JSON payload in ``evt.result.json``, selects the ``NBest``
    entry with the highest ``Confidence`` (the first one on ties) and appends
    its details to the module-level accumulator lists.

    A payload with no ``NBest`` entries is skipped entirely, so the
    accumulator lists never get out of step with one another.

    Args:
        evt: Event object exposing the result JSON string as
            ``evt.result.json``.
    """
    import json
    import logging

    # Obtain the logger locally: no module-level `logger` is defined
    # alongside this function, so referencing one raised NameError.
    log = logging.getLogger(__name__)
    log.debug(evt)

    response = json.loads(evt.result.json)
    hypotheses = response.get('NBest') or []
    if not hypotheses:
        log.debug('No NBest hypotheses in result; skipping')
        return

    # Entries without a confidence score rank lowest.
    best = max(hypotheses, key=lambda item: item.get('Confidence') or 0.0)

    transcript_display_list.append(
        response.get('DisplayText', best.get('Display', '')))
    confidence_list.append(best.get('Confidence'))
    transcript_ITN_list.append(best.get('ITN', ''))
    # 'Words' is absent when word-level details are not part of the result.
    words.extend(best.get('Words', []))