WER for wav2vec2 small (base-960h) fine-tuned on 5-40 minutes
WER check on the CMU ARCTIC AWB dataset for wav2vec2 base-960h fine-tuned on 5 to 40 minutes of audio.
Original on Kaggle.
%%capture
!pip install transformers datasets jiwer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from datasets import Dataset
import soundfile as sf
import torch
from jiwer import wer
# Collect the test utterance IDs from the fairseq manifest (the first line is a header).
test_ids = []
with open("../input/cmu-us-awb-arctic-fairseq-files/test.tsv") as tsvf:
    for line in tsvf.readlines()[1:]:
        parts = line.split("\t")
        test_ids.append(parts[0].replace(".wav", ""))
# Map each utterance ID to its reference transcript, upper-cased to match
# the model's letter vocabulary.
transcripts = {}
with open("../input/cmu-us-awb-arctic-fairseq-files/text.tsv") as tsf:
    for line in tsf.readlines():
        parts = line.strip().split("\t")
        transcripts[parts[0]] = parts[1].upper()
paths = []
text = []
for utt_id in test_ids:
    paths.append(f"/kaggle/input/cmu-us-awb-arctic-tts-dataset/cmu_us_awb_arctic/wav/{utt_id}.wav")
    text.append(transcripts[utt_id])
dataset = Dataset.from_dict({"file": paths, "text": text})
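Optional sanity check: every test ID should have resolved to both a wav path and a transcript.
assert len(paths) == len(text) == len(test_ids)
print(len(dataset), "test utterances")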
# The tokenizer is the same for every checkpoint, so load it once.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
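Aside: newer transformers releases recommend Wav2Vec2Processor over Wav2Vec2Tokenizer; a drop-in alternative (a sketch, assuming a recent transformers version) would be:
from transformers import Wav2Vec2Processor
# The processor bundles the feature extractor and the CTC tokenizer, and is
# called and batch_decode()d exactly as the tokenizer is below.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")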
def map_to_array(batch):
    # Read the raw waveform from disk; no resampling is done here.
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch

dataset = dataset.map(map_to_array)
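wav2vec2-base-960h expects 16 kHz input and map_to_array does no resampling, so it is worth confirming the corpus sample rate (a minimal check, assuming the first file is representative):
_, sample_rate = sf.read(paths[0])
assert sample_rate == 16000, f"expected 16 kHz audio, got {sample_rate} Hz"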
def map_to_pred(batch):
    # The tokenizer doubles as the feature extractor: it pads the raw audio
    # and returns model-ready input_values.
    input_values = tokenizer(batch["speech"], return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        # `model` is a global, set per checkpoint in the evaluation loop below.
        logits = model(input_values).logits
    # Greedy CTC decoding: argmax per frame, then collapse via batch_decode.
    predicted_ids = torch.argmax(logits, dim=-1)
    batch["transcription"] = tokenizer.batch_decode(predicted_ids)
    return batch
# Each fine-tuning duration is published as a git revision on the model repo,
# so evaluate every checkpoint in one loop.
for revision in ["5mins", "10mins", "15mins", "20mins", "25mins", "30mins", "35mins", "40mins"]:
    model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec-awb", revision=revision)
    # load_from_cache_file=False forces fresh predictions for each revision.
    result = dataset.map(map_to_pred, batched=True, batch_size=1,
                         remove_columns=["speech"], load_from_cache_file=False)
    print(revision, "WER:", wer(result["text"], result["transcription"]))