%%capture
# Not needed on Colab
%pip install librosa
%pip install numpy matplotlib
%pip install tqdm

According to the librosa documentation, the following resampling backends are optional dependencies and must be installed separately:

%%capture
%pip install samplerate
%pip install resampy
# Every resampling backend librosa can dispatch to: the soxr family,
# resampy's kaiser/sinc modes, scipy-based FFT, and the cheap built-ins.
LIBROSA_RESAMPLING_METHODS = {
    "soxr_vhq",
    "soxr_hq",
    "soxr_mq",
    "soxr_lq",
    "soxr_qq",
    "kaiser_best",
    "kaiser_fast",
    "fft",
    "scipy",
    "polyphase",
    "linear",
    "zero_order_hold",
    "sinc_best",
    "sinc_medium",
    "sinc_fastest",
}

Install Whisper directly from its GitHub repository:

%%capture
%pip install git+https://github.com/openai/whisper.git
%%capture
%pip install jiwer
import whisper

# Load both model sizes up front: large-v3 as the high-quality reference model,
# tiny.en as the fast English-only model exercised in the experiments below.
# (The original imported whisper twice; one import suffices.)
model = whisper.load_model("large-v3")
model_tiny_en = whisper.load_model("tiny.en")

Sine tone

Generate a 440 Hz sine tone (example adapted from the librosa documentation):

# Generate a pure 440 Hz (A4) sine tone, one second long at 22 050 Hz.
import librosa
tone = librosa.tone(440, sr=22050, length=22050)
from IPython.display import Audio, display

# Render an inline audio player for the tone.
display(Audio(tone, rate=22050))
import matplotlib.pyplot as plt
import numpy as np
fig, ax = plt.subplots()
# Mel spectrogram of the tone; power_to_db converts power to a dB scale
# relative to the spectrogram's peak so the single tone is clearly visible.
S = librosa.feature.melspectrogram(y=tone)
librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
                         x_axis='time', y_axis='mel', ax=ax)
<matplotlib.collections.QuadMesh at 0x7f6e1c786650>
!wget https://dagshub.com/DagsHub/Librispeech-ASR-corpus/raw/2fead768d9690a42d186188ed77a6d4c63c949dd/dev-clean/84/121123/84-121123-0003.flac
--2023-12-21 15:36:30--  https://dagshub.com/DagsHub/Librispeech-ASR-corpus/raw/2fead768d9690a42d186188ed77a6d4c63c949dd/dev-clean/84/121123/84-121123-0003.flac
Resolving dagshub.com (dagshub.com)... 35.186.200.224
Connecting to dagshub.com (dagshub.com)|35.186.200.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: ‘84-121123-0003.flac’

84-121123-0003.flac     [ <=>                ] 122.19K  --.-KB/s    in 0.02s   

2023-12-21 15:36:30 (6.85 MB/s) - ‘84-121123-0003.flac’ saved [125125]

# Load the LibriSpeech clip; librosa.load resamples to 22 050 Hz by default.
sample_audio, sample_audio_rate = librosa.load("84-121123-0003.flac")
Audio(sample_audio, rate=sample_audio_rate)
# Ground-truth transcript for the clip (LibriSpeech-style: uppercase, no punctuation).
sample_text = "AND THE CRY ISSUED FROM HIS PORES IF WE MAY THUS SPEAK A CRY FRIGHTFUL IN ITS SILENCE"
# Punctuation removed before WER comparison; table built once at module level.
_PUNCTUATION_TABLE = str.maketrans("", "", ",;!?:.")

def clean_text(text):
    """Normalize a transcript for WER comparison.

    Lowercases the text, removes the punctuation characters ``,;!?:.`` in a
    single ``str.translate`` pass (instead of six chained ``.replace`` calls),
    and strips leading/trailing whitespace. Word-internal characters such as
    apostrophes and hyphens are deliberately preserved.
    """
    return text.lower().translate(_PUNCTUATION_TABLE).strip()
# Transcribe the clean clip with the tiny English model and verify a perfect match.
tiny_pred = model_tiny_en.transcribe(sample_audio)["text"]
from jiwer import wer
# jiwer.wer(reference, hypothesis): the ground-truth transcript must come
# first. (The original had the arguments swapped — harmless when WER is
# exactly 0, but wrong in general since WER is asymmetric.)
assert wer(clean_text(sample_text), clean_text(tiny_pred)) == 0.0
0.0
from tqdm import tqdm
def resampling_wobble(audio_in, orig_sr=16_000, target_sr=22_050, res_type="kaiser_best"):
    """Round-trip a signal through one upsample/downsample cycle.

    Resamples ``audio_in`` from ``orig_sr`` up to ``target_sr`` and straight
    back down with the same ``res_type`` backend, returning the (slightly
    degraded) signal at the original sample rate.
    """
    upsampled = librosa.resample(audio_in, orig_sr=orig_sr, target_sr=target_sr, res_type=res_type)
    return librosa.resample(upsampled, orig_sr=target_sr, target_sr=orig_sr, res_type=res_type)
import copy

# Per-backend experiment state, keyed by resampling method:
#   results    — list of (iteration, WER) change-points; (0, 0.0) seeds the
#                series, and a None WER marks where the model stopped
#                producing any text.
#   last_wer   — most recent WER, so only changes are recorded.
#   audio      — current, progressively degraded audio for that backend.
#   last_empty — whether the previous transcription already came back empty.
results = {x: [(0, 0.0)] for x in LIBROSA_RESAMPLING_METHODS}
last_wer = {x: 0.0 for x in LIBROSA_RESAMPLING_METHODS}
audio = {x: copy.deepcopy(sample_audio) for x in LIBROSA_RESAMPLING_METHODS}
last_empty = {x: False for x in LIBROSA_RESAMPLING_METHODS}
sample_clean = clean_text(sample_text)
for num_tries in tqdm(range(1, 5000)):
    for algo in LIBROSA_RESAMPLING_METHODS:
        audio_down = resampling_wobble(audio[algo], orig_sr=16_000, target_sr=22_050, res_type=algo)
        pred_clean = clean_text(model_tiny_en.transcribe(audio_down)["text"])
        if not pred_clean:  # empty string is falsy; no second check needed
            if not last_empty[algo]:
                results[algo].append((num_tries, None))
                last_empty[algo] = True
        else:
            # jiwer.wer(reference, hypothesis): ground truth first. The
            # original passed the prediction as reference, which skews the
            # score because WER normalizes by the reference word count.
            wer_res = wer(sample_clean, pred_clean)
            # Only commit the degraded audio when the model still produced text.
            audio[algo] = audio_down
            if wer_res != last_wer[algo]:
                last_wer[algo] = wer_res
                results[algo].append((num_tries, wer_res))
                print(num_tries, wer_res)
  0%|          | 0/4999 [00:00<?, ?it/s] 21%|██▏       | 1074/4999 [1:13:38<3:47:44,  3.48s/it]
# Re-run the wobble by hand, starting from a fresh copy of the clean sample.
start_over = copy.deepcopy(sample_audio)
Audio(start_over, rate=22_050)
# NOTE(review): `algo` is whatever backend the experiment loop happened to end
# on — set it explicitly if a specific backend is intended.
for _ in range(0, 2):
    audio_up = librosa.resample(start_over, orig_sr=16_000, target_sr=22_050, res_type=algo)
    # Feed the round-trip result back in so the two iterations compound.
    # (The original downsampled the stale `audio_in` from the earlier loop
    # and discarded the result, so the loop had no cumulative effect.)
    start_over = librosa.resample(audio_up, orig_sr=22_050, target_sr=16_000, res_type=algo)