Playing with resampling pairs
Does the resampling method affect the audio?
%%capture
# Not needed on Colab
%pip install librosa
%pip install numpy matplotlib
%pip install tqdm
According to the librosa documentation, these need to be manually installed:
%%capture
%pip install samplerate
%pip install resampy
# Every `res_type` value exercised in this notebook, grouped by the backend
# that the librosa.resample documentation lists for it.
LIBROSA_RESAMPLING_METHODS = {
    # soxr
    "soxr_vhq",
    "soxr_hq",
    "soxr_mq",
    "soxr_lq",
    "soxr_qq",
    # resampy
    "kaiser_best",
    "kaiser_fast",
    # scipy.signal
    "fft",
    "scipy",
    "polyphase",
    # samplerate
    "linear",
    "zero_order_hold",
    "sinc_best",
    "sinc_medium",
    "sinc_fastest",
}
Set up Whisper from git:
%%capture
%pip install git+https://github.com/openai/whisper.git
%%capture
%pip install jiwer
import whisper
# Load the "large-v3" checkpoint. NOTE(review): none of the cells visible in
# this part of the notebook call `model`; confirm it is used further down
# before paying for the download and the memory.
model = whisper.load_model("large-v3")
import whisper
# Load the "tiny.en" checkpoint; this is the model the transcription calls in
# this part of the notebook go through.
model_tiny_en = whisper.load_model("tiny.en")
Generate a sine tone (from librosa documentation)
import librosa
# 440 Hz tone: 22050 samples at a 22050 Hz sample rate, i.e. one second.
tone = librosa.tone(440, sr=22050, length=22050)
from IPython.display import Audio, display
# Show an inline player; `rate` must match the rate the tone was generated at.
display(Audio(tone, rate=22050))
# Older librosa releases do not load the `display` submodule on a plain
# `import librosa`, so import it explicitly before calling specshow().
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Plot the mel spectrogram of the test tone in decibels relative to its peak.
fig, ax = plt.subplots()
# The sample rate is passed explicitly (same value the tone was generated
# with) so the frequency and time axes stay correct if that rate is changed.
S = librosa.feature.melspectrogram(y=tone, sr=22050)
librosa.display.specshow(
    librosa.power_to_db(S, ref=np.max),
    sr=22050,
    x_axis='time',
    y_axis='mel',
    ax=ax,
)
!wget https://dagshub.com/DagsHub/Librispeech-ASR-corpus/raw/2fead768d9690a42d186188ed77a6d4c63c949dd/dev-clean/84/121123/84-121123-0003.flac
# Whisper's transcribe() treats a raw NumPy array as 16 kHz audio, and the
# resampling round trips in this notebook use orig_sr=16_000. librosa.load()
# resamples to 22050 Hz unless `sr` is given, so ask for 16 kHz explicitly to
# keep the array and those assumptions in agreement.
sample_audio, sample_audio_rate = librosa.load("84-121123-0003.flac", sr=16_000)
Audio(sample_audio, rate=sample_audio_rate)
# Reference transcript for the clip loaded above.
sample_text = "AND THE CRY ISSUED FROM HIS PORES IF WE MAY THUS SPEAK A CRY FRIGHTFUL IN ITS SILENCE"
def clean_text(text: str) -> str:
    """Normalise a transcript before computing a word error rate.

    The text is lower-cased, the punctuation marks ``, ; ! ? : .`` are
    deleted, and leading/trailing whitespace is stripped. Other characters
    (apostrophes, hyphens, inner whitespace) are left untouched.

    Args:
        text: Raw transcript or model prediction.

    Returns:
        The normalised string; empty if nothing but the removed punctuation
        and whitespace was present.
    """
    # A single translate() pass deletes all six marks at once instead of
    # rebuilding the string once per chained str.replace() call.
    punctuation = str.maketrans("", "", ",;!?:.")
    return text.lower().translate(punctuation).strip()
from jiwer import wer

# Sanity check: the untouched clip must transcribe perfectly with the tiny
# model, otherwise later WER changes cannot be attributed to resampling.
tiny_pred = model_tiny_en.transcribe(sample_audio)["text"]
# jiwer.wer() expects the reference first and the hypothesis second.
assert wer(clean_text(sample_text), clean_text(tiny_pred)) == 0.0
from tqdm import tqdm
def resampling_wobble(audio_in, orig_sr=16_000, target_sr=22_050, res_type="kaiser_best"):
    """Resample ``audio_in`` to ``target_sr`` and straight back to ``orig_sr``.

    Both conversions use the same ``res_type``, so the returned signal carries
    whatever that method introduces over one full round trip.

    Args:
        audio_in: Audio samples at ``orig_sr``.
        orig_sr: Sample rate of ``audio_in`` and of the returned audio.
        target_sr: Intermediate sample rate of the round trip.
        res_type: Resampling method name passed to ``librosa.resample``.

    Returns:
        The audio after the round trip, back at ``orig_sr``.
    """
    there = librosa.resample(
        audio_in,
        orig_sr=orig_sr,
        target_sr=target_sr,
        res_type=res_type,
    )
    # Swap the two rates to come back to where we started.
    return librosa.resample(
        there,
        orig_sr=target_sr,
        target_sr=orig_sr,
        res_type=res_type,
    )
import copy

# Per-algorithm state for the degradation experiment:
#   results    - change log of (round, WER) pairs; WER is None from the round
#                where the model starts returning an empty transcription
#   last_wer   - most recently logged WER
#   audio      - the clip after all round trips applied so far
#   last_empty - whether the previous transcription was empty
results = {x: [(0, 0.0)] for x in LIBROSA_RESAMPLING_METHODS}
last_wer = {x: 0.0 for x in LIBROSA_RESAMPLING_METHODS}
audio = {x: copy.deepcopy(sample_audio) for x in LIBROSA_RESAMPLING_METHODS}
last_empty = {x: False for x in LIBROSA_RESAMPLING_METHODS}
sample_clean = clean_text(sample_text)

for num_tries in tqdm(range(1, 5000)):
    for algo in LIBROSA_RESAMPLING_METHODS:
        audio_in = audio[algo]
        audio_down = resampling_wobble(audio_in, orig_sr=16_000, target_sr=22_050, res_type=algo)
        # Always keep the newest round trip. If the state only advanced on a
        # non-empty transcription, one empty result would make every later
        # round repeat the same input and the same empty output.
        audio[algo] = audio_down
        pred = model_tiny_en.transcribe(audio_down)["text"]
        pred_clean = clean_text(pred)

        if not pred_clean:
            # Log only the first round of a run of empty transcriptions.
            if not last_empty[algo]:
                results[algo].append((num_tries, None))
                last_empty[algo] = True
            continue

        # jiwer.wer() expects the reference first and the hypothesis second,
        # so the rate is normalised by the length of the reference text.
        wer_res = wer(sample_clean, pred_clean)
        # Log on any change, including recovery from an empty transcription
        # to the WER that was current before it.
        if last_empty[algo] or wer_res != last_wer[algo]:
            last_wer[algo] = wer_res
            results[algo].append((num_tries, wer_res))
            print(algo, num_tries, wer_res)
        last_empty[algo] = False
# Restart from a pristine copy of the clip.
start_over = copy.deepcopy(sample_audio)
# start_over is a copy of sample_audio, so play it at the rate reported when
# that clip was loaded instead of a hard-coded number.
Audio(start_over, rate=sample_audio_rate)
# NOTE(review): `algo` here is whatever value the previous loop over
# LIBROSA_RESAMPLING_METHODS left behind; because that is a set, the method is
# effectively arbitrary. Confirm which res_type this cell is meant to use.
for _ in range(0, 2):
    audio_up = librosa.resample(start_over, orig_sr=16_000, target_sr=22_050, res_type=algo)
    # Down-sample the array that was just up-sampled. The previous code passed
    # `audio_in`, a leftover from the experiment loop, so this step ignored
    # `start_over` entirely.
    audio_down = librosa.resample(audio_up, orig_sr=22_050, target_sr=16_000, res_type=algo)