Original on Kaggle

%%capture
!pip install transformers datasets jiwer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from datasets import Dataset
import soundfile as sf
import torch
from jiwer import wer
# Utterance IDs for the test split (skip the header row, drop the .wav extension)
test_ids = []
with open("../input/create-ljspeech-splits/test.tsv") as tsvf:
    for line in tsvf.readlines()[1:]:
        parts = line.split("\t")
        test_ids.append(parts[0].replace(".wav", ""))

# Reference transcripts, uppercased to match the model's letter vocabulary
transcripts = {}
with open("../input/ljspeech-for-asr/transcripts.tsv") as tsf:
    for line in tsf.readlines():
        parts = line.strip().split("\t")
        transcripts[parts[0]] = parts[1].upper()

# Pair each test utterance's 16 kHz audio path with its reference text
paths = []
text = []
for utt_id in test_ids:
    paths.append(f"/kaggle/input/ljspeech-for-asr/wav16/{utt_id}.wav")
    text.append(transcripts[utt_id])

dataset = Dataset.from_dict({"file": paths, "text": text})
# Load each waveform from disk into the dataset
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch

dataset = dataset.map(map_to_array)
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
/opt/conda/lib/python3.7/site-packages/transformers/models/wav2vec2/tokenization_wav2vec2.py:748: FutureWarning: The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.
  FutureWarning,
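As the warning notes, Wav2Vec2Tokenizer is deprecated in favour of Wav2Vec2Processor. A minimal sketch of the same front end using the processor API, assuming the same checkpoint and the 16 kHz audio loaded above (map_to_pred_processor is a hypothetical name; the cells below keep the original tokenizer):

from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

def map_to_pred_processor(batch):
    # the processor's feature extractor normalizes the raw waveform
    inputs = processor(batch["speech"], sampling_rate=16000,
                       return_tensors="pt", padding="longest")
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # greedy CTC decode, as in map_to_pred below
    batch["transcription"] = processor.batch_decode(torch.argmax(logits, dim=-1))
    return batch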
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="35mins")

def map_to_pred(batch):
    # Reads whichever `model` is currently loaded, so this one function
    # is reused unchanged for every checkpoint evaluated below
    input_values = tokenizer(batch["speech"], return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: take the most likely token per frame;
    # batch_decode collapses repeats and strips the blank token
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)
    batch["transcription"] = transcription
    return batch

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.03222965833039803
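jiwer's wer is word-level edit distance: (substitutions + insertions + deletions) divided by the number of reference words, so the 0.0322 above is roughly 3.2 errors per hundred words. A quick sanity check on a made-up pair (not from this dataset):

# one substitution over four reference words -> 0.25
print(wer("the cat sat down", "the cat stood down"))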
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="40mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.03126100739697076
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="45mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.03196548080309968
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="50mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.028883409651285663
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="55mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.029059528002817893
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="60mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.028531172948221203
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="65mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.02694610778443114
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="70mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.029411764705882353
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="75mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.02588939767523776
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision="80mins")

result = dataset.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))
WER: 0.027738640366326173
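Collected, the sweep goes from a WER of roughly 3.2% with 35 minutes of training audio to roughly 2.6% with 75 minutes, with small regressions at 45, 70, and 80 minutes. Since the cells above differ only in the revision tag, the whole sweep could be driven from one loop; a sketch reusing map_to_pred (load_from_cache_file=False keeps datasets from serving a cached result when only the global model has changed):

for revision in ["35mins", "40mins", "45mins", "50mins", "55mins",
                 "60mins", "65mins", "70mins", "75mins", "80mins"]:
    model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-ljspeech-splits", revision=revision)
    result = dataset.map(map_to_pred, batched=True, batch_size=1,
                         remove_columns=["speech"], load_from_cache_file=False)
    print(revision, "WER:", wer(result["text"], result["transcription"]))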