from transformers import pipeline
# Hugging Face model id of the English wav2vec2 checkpoint.
EN_MODEL = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

# Pipeline built from EN_MODEL; it is called on wav file paths further down.
# NOTE(review): device=0 presumably selects the first GPU - confirm this is
# available on the machine running the notebook.
pipe_en = pipeline(model=EN_MODEL, device=0)
# Reference text for each recording: the transcription loops below pair the
# item at index i with the file "{i + 1}.wav". Bracketed tags such as
# "[laughs]" are annotations rather than words to be scored.
sentences = [
	"So, we decided to go to the zoo, and... uh, the first thing we saw was the lions. [laughs] They were just lazing around.",
	"[gasps] Oh, and you won't believe the size of the elephants. They were HUGE.",
	"We were walking past the monkey enclosure when one of them [laughs]—seriously, it started mimicking us!",
	"[sighs] The weather was kinda hot, but... it was worth it, seeing all those animals up close.",
	"And the penguins, they were so cute, waddling around. [laughs] I could watch them all day.",
	"I was a bit scared of the snakes, not gonna lie. Every time one moved, I just... [gasps] jumped a little.",
	"[clears throat] Oh, and the food there? Surprisingly good. We had these amazing burgers right by the giraffe area.",
	"The kids were just fascinated by the aquarium section. All those colorful fish and... uh, the sharks were a bit scary, though.",
	"[laughter] You should've seen your face when that parrot started talking! Priceless.",
	"And... um, at the end of the day, we were all just so tired, but, you know, it was a great day. [sighs] Really special."
]
from pathlib import Path

# Run the wav2vec2 pipeline over every wav file in the directory. The file
# names are kept in a list parallel to the results so that each transcription
# can later be matched to its sentence number.
wav_paths = list(Path("/home/joregan/shivam-sentences").glob("*.wav"))
files = [wav_path.name for wav_path in wav_paths]
outputs = [
    pipe_en(str(wav_path), chunk_length_s=10, return_timestamps="word")
    for wav_path in wav_paths
]
import re
from string import punctuation

PUNCT = set(punctuation)

# A bracketed annotation such as "[laughs]" or "[clears throat]". The tag may
# contain spaces, so it has to be removed before the text is split into words.
_BRACKET_TAG = re.compile(r"\[[^\[\]]*\]")


def clean_sentence(text):
    """Normalise a sentence so it can be used as a WER reference/hypothesis.

    Bracketed annotations (including multi-word ones like "[clears throat]")
    are removed, em dashes are treated as word separators, ASCII punctuation
    is stripped from both ends of every word, and the result is lower-cased.
    Punctuation inside a word (e.g. the apostrophe in "won't") is kept.

    Args:
        text: The sentence to normalise.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    # An em dash can glue two tokens together ("[laughs]—seriously"), so it is
    # turned into a separator before anything else happens.
    text = _BRACKET_TAG.sub(" ", text.replace("—", " "))
    words = []
    for word in text.split():
        word = word.strip(punctuation)
        # Tokens consisting only of punctuation (e.g. "...") strip down to
        # nothing and must not become empty words in the output.
        if word:
            words.append(word.lower())
    return " ".join(words)
# Normalised references, in sentence order.
clean_sentences = list(map(clean_sentence, sentences))
# The wav files are named "<n>.wav"; recover n so that the transcriptions can
# be put into the same order as the references.
numbers = [int(name.replace(".wav", "")) for name in files]
text_out = [result["text"] for result in outputs]
tmp_sort = sorted(zip(numbers, text_out))
hyp = [text for _, text in tmp_sort]
from jiwer import wer

wer(clean_sentences, hyp)
>>> wer(clean_sentences, hyp)
0.40782122905027934
import whisper

# Transcribe the recordings with Whisper large-v3; recording i (1-based) is
# the audio for sentences[i - 1].
model = whisper.load_model("large-v3")
outputs = [
    model.transcribe(f"/home/joregan/shivam-sentences/{number}.wav", language="en")
    for number in range(1, len(sentences) + 1)
]
# The references are normalised with clean_sentence, so the hypotheses must be
# normalised the same way; otherwise case and punctuation differences are
# counted as word errors.
output_text = [clean_sentence(x["text"].strip()) for x in outputs]
# Per-sentence WER (reference first, hypothesis second).
output_wer = [
    wer(reference, hypothesis)
    for reference, hypothesis in zip(clean_sentences, output_text)
]
>>> output_wer
[0.043478260869565216, 0.0, 0.0, 1.75, 0.06666666666666667, 0.0, 0.3684210526315789, 0.045454545454545456, 0.18181818181818182, 0.125]
>>> output_text
['so we decided to go to the zoo and the first thing we saw was the lions they were just lazing around', "oh and you won't believe the size of the elephants they were huge", 'we were walking past the monkey enclosure when one of them seriously it started mimicking us', 'and i was also very very money in the similar silver right or part it was i guess that they were told that they would ask you to pantane with him', 'and the penguins they were so cute like waddling around i could watch them all day', 'i was a bit scared of the snakes not gonna lie every time one moved i just jumped a little', 'um uh oh and the food there uh surprisingly good uh we had these amazing uh burgers right by the garage area uh', 'the kids were just fascinated by the aquarium section all those colorful fish and the sharks were a bit scary though', 'you should have seen your face when that parrot started talking priceless', 'and at the end of the day we were all just so tired but you know it was a great day woo']
# Transcribe every recording again, this time passing the matching reference
# sentence to Whisper as initial_prompt.
outputs_self_prompt = [
    model.transcribe(
        f"/home/joregan/shivam-sentences/{position + 1}.wav",
        language="en",
        initial_prompt=prompt,
    )
    for position, prompt in enumerate(sentences)
]
# Normalise the hypotheses and score each one against its reference.
output_text = [
    clean_sentence(result["text"].strip()) for result in outputs_self_prompt
]
output_wer = [
    wer(reference, hypothesis)
    for reference, hypothesis in zip(clean_sentences, output_text)
]
>>> output_wer
[0.08695652173913043, 0.0, 0.0, 1.6875, 1.5333333333333334, 0.0, 1.1578947368421053, 0.0, 0.0, 0.08333333333333333]
>>> output_text
['uh so we decided to go to the zoo and uh the first thing we saw was the lions um they were just lazing around', "oh and you won't believe the size of the elephants they were huge", 'we were walking past the monkey enclosure when one of them seriously it started mimicking us', 'and i was also very money in a similar silver right or part it was i guess that they were told that they would ask you to pantane with him', "i was like i'm going to go to the zoo i'm going to go to the zoo i'm going to go to the zoo", 'i was a bit scared of the snakes not gonna lie every time one moved i just jumped a little', "the food was so good really i'm so sad to hear this stuff over konnte like another度 doe you in love dana", 'the kids were just fascinated by the aquarium section all those colorful fish and uh the sharks were a bit scary though', "you should've seen your face when that parrot started talking priceless", 'and uh at the end of the day we were all just so tired but you know it was a great day woo really special']
def prune_fillers(text):
    """Return *text* with the filler words "uh" and "um" removed.

    The text is split on single spaces, every token that is exactly "uh" or
    "um" (case-sensitive) is dropped, and the remaining tokens are joined
    with single spaces again.
    """
    fillers = ("uh", "um")
    kept = []
    for token in text.split(" "):
        if token in fillers:
            continue
        kept.append(token)
    return " ".join(kept)
# Score the plain transcriptions with filler words ignored.
output_text = [clean_sentence(x["text"].strip()) for x in outputs]
output_nofill = [prune_fillers(x) for x in output_text]
# Several references contain "uh"/"um" themselves, so the fillers have to be
# pruned from both sides; pruning only the hypothesis turns every reference
# filler into a deletion error.
output_wer = [
    wer(prune_fillers(reference), hypothesis)
    for reference, hypothesis in zip(clean_sentences, output_nofill)
]
>>> output_wer
[0.043478260869565216, 0.0, 0.0, 1.75, 0.06666666666666667, 0.0, 0.15789473684210525, 0.045454545454545456, 0.18181818181818182, 0.125]
# Score the self-prompted transcriptions with filler words ignored.
output_text = [clean_sentence(x["text"].strip()) for x in outputs_self_prompt]
output_nofill = [prune_fillers(x) for x in output_text]
# Several references contain "uh"/"um" themselves, so the fillers have to be
# pruned from both sides; pruning only the hypothesis turns every reference
# filler into a deletion error.
output_wer = [
    wer(prune_fillers(reference), hypothesis)
    for reference, hypothesis in zip(clean_sentences, output_nofill)
]
>>> output_wer
[0.043478260869565216, 0.0, 0.0, 1.6875, 1.5333333333333334, 0.0, 1.1578947368421053, 0.045454545454545456, 0.0, 0.08333333333333333]
>>> output_nofill
['so we decided to go to the zoo and the first thing we saw was the lions they were just lazing around', "oh and you won't believe the size of the elephants they were huge", 'we were walking past the monkey enclosure when one of them seriously it started mimicking us', 'and i was also very money in a similar silver right or part it was i guess that they were told that they would ask you to pantane with him', "i was like i'm going to go to the zoo i'm going to go to the zoo i'm going to go to the zoo", 'i was a bit scared of the snakes not gonna lie every time one moved i just jumped a little', "the food was so good really i'm so sad to hear this stuff over konnte like another度 doe you in love dana", 'the kids were just fascinated by the aquarium section all those colorful fish and the sharks were a bit scary though', "you should've seen your face when that parrot started talking priceless", 'and at the end of the day we were all just so tired but you know it was a great day woo really special']
output_nofill = ['so we decided to go to the zoo and the first thing we saw was the lions they were just lazing around', "oh and you won't believe the size of the elephants they were huge", 'we were walking past the monkey enclosure when one of them seriously it started mimicking us', 'and i was also very money in a similar silver right or part it was i guess that they were told that they would ask you to pantane with him', "i was like i'm going to go to the zoo i'm going to go to the zoo i'm going to go to the zoo", 'i was a bit scared of the snakes not gonna lie every time one moved i just jumped a little', "the food was so good really i'm so sad to hear this stuff over konnte like another度 doe you in love dana", 'the kids were just fascinated by the aquarium section all those colorful fish and the sharks were a bit scary though', "you should've seen your face when that parrot started talking priceless", 'and at the end of the day we were all just so tired but you know it was a great day woo really special']
# Show each original sentence directly above its filler-free transcription,
# with a blank line after every pair.
for reference, hypothesis in zip(sentences, output_nofill):
    print(f"{reference}\n{hypothesis}\n")
So, we decided to go to the zoo, and... uh, the first thing we saw was the lions. [laughs] They were just lazing around.
so we decided to go to the zoo and the first thing we saw was the lions they were just lazing around

[gasps] Oh, and you won't believe the size of the elephants. They were HUGE.
oh and you won't believe the size of the elephants they were huge

We were walking past the monkey enclosure when one of them [laughs]—seriously, it started mimicking us!
we were walking past the monkey enclosure when one of them seriously it started mimicking us

[sighs] The weather was kinda hot, but... it was worth it, seeing all those animals up close.
and i was also very money in a similar silver right or part it was i guess that they were told that they would ask you to pantane with him

And the penguins, they were so cute, waddling around. [laughs] I could watch them all day.
i was like i'm going to go to the zoo i'm going to go to the zoo i'm going to go to the zoo

I was a bit scared of the snakes, not gonna lie. Every time one moved, I just... [gasps] jumped a little.
i was a bit scared of the snakes not gonna lie every time one moved i just jumped a little

[clears throat] Oh, and the food there? Surprisingly good. We had these amazing burgers right by the giraffe area.
the food was so good really i'm so sad to hear this stuff over konnte like another度 doe you in love dana

The kids were just fascinated by the aquarium section. All those colorful fish and... uh, the sharks were a bit scary, though.
the kids were just fascinated by the aquarium section all those colorful fish and the sharks were a bit scary though

[laughter] You should've seen your face when that parrot started talking! Priceless.
you should've seen your face when that parrot started talking priceless

And... um, at the end of the day, we were all just so tired, but, you know, it was a great day. [sighs] Really special.
and at the end of the day we were all just so tired but you know it was a great day woo really special

sentences[3]
'[sighs] The weather was kinda hot, but... it was worth it, seeing all those animals up close.'
>>> model = whisper.load_model("medium.en")
100%|█████████████████████████████████████| 1.42G/1.42G [02:01<00:00, 12.6MiB/s]
>>> outputs = []
>>> for idx, sentence in enumerate(sentences):
...     file=f"/home/joregan/shivam-sentences/{idx + 1}.wav"
...     res = model.transcribe(file, language="en")
...     outputs.append(res)
... 
>>> output_text = [clean_sentence(x["text"].strip()) for x in outputs]
>>> output_nofill = [prune_fillers(x) for x in output_text]
>>> output_wer = [wer(p[0], p[1]) for p in zip(clean_sentences, output_nofill)]
>>> output_wer
[0.043478260869565216, 0.07692307692307693, 0.375, 1.75, 0.06666666666666667, 0.0, 0.15789473684210525, 0.045454545454545456, 0.18181818181818182, 0.125]
import stable_whisper

# Load the medium English model through stable-ts and transcribe every
# recording; recording i + 1 is the audio for sentences[i].
model = stable_whisper.load_model('medium.en')

outputs = [
    model.transcribe(f"/home/joregan/shivam-sentences/{position + 1}.wav", language="en")
    for position, _ in enumerate(sentences)
]

# Print one "start<TAB>end<TAB>word" line per word.
# NOTE(review): _start/_end are private attributes of the word objects -
# confirm whether the installed stable-ts exposes public equivalents.
for result in outputs:
    for timing in result.all_words():
        print(timing._start, timing._end, timing.word.strip(), sep="\t")
from pathlib import Path
import argparse
import stable_whisper


def get_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace with ``wav_directory`` (a Path) and
        ``tsv_directory`` (a Path, or None when the argument is omitted).
    """
    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description, so the summary text has to be passed by keyword; otherwise
    # it replaces the program name in the usage line.
    parser = argparse.ArgumentParser(
        description="Transcribe a directory of wav files"
    )

    parser.add_argument(
        "wav_directory",
        type=Path,
        help="directory containing the .wav files to transcribe",
    )
    parser.add_argument(
        "tsv_directory",
        nargs="?",
        type=Path,
        help="directory for the .tsv output (default: wav_directory)",
    )

    return parser.parse_args()


def main():
    """Transcribe every wav file in a directory and write word timings.

    For each ``*.wav`` file in the input directory a ``<stem>.tsv`` file is
    written with one ``start<TAB>end<TAB>word`` line per word. The output
    goes to the optional second command-line argument, or next to the wav
    files when it is omitted.
    """
    args = get_args()

    indir = args.wav_directory
    if args.tsv_directory:
        outdir = args.tsv_directory
        # Create the requested output directory up front, so a missing path
        # does not abort the run only after the first transcription is done.
        outdir.mkdir(parents=True, exist_ok=True)
    else:
        outdir = indir

    model = stable_whisper.load_model('medium.en')

    for wav in indir.glob("*.wav"):
        output = model.transcribe(str(wav), language="en")
        outfile = outdir / f"{wav.stem}.tsv"
        # Explicit encoding: the platform default may not be able to
        # represent every character the model emits.
        with open(outfile, "w", encoding="utf-8") as tsv:
            # NOTE(review): _start/_end are private attributes of the word
            # objects - confirm whether the installed stable-ts exposes
            # public equivalents.
            for word in output.all_words():
                tsv.write(f"{word._start}\t{word._end}\t{word.word.strip()}\n")


if __name__ == '__main__':
    main()
from pathlib import Path
import argparse
import whisperx
import torch


def get_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace with ``wav_directory`` (a Path) and
        ``tsv_directory`` (a Path, or None when the argument is omitted).
    """
    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description, so the summary text has to be passed by keyword; otherwise
    # it replaces the program name in the usage line.
    parser = argparse.ArgumentParser(
        description="Transcribe a directory of wav files"
    )

    parser.add_argument(
        "wav_directory",
        type=Path,
        help="directory containing the .wav files to transcribe",
    )
    parser.add_argument(
        "tsv_directory",
        nargs="?",
        type=Path,
        help="directory for the .tsv output (default: wav_directory)",
    )

    return parser.parse_args()


def main():
    """Transcribe and word-align every wav file in a directory.

    For each ``*.wav`` file in the input directory a ``<stem>.tsv`` file is
    written with one ``start<TAB>end<TAB>word`` line per aligned word. The
    output goes to the optional second command-line argument, or next to the
    wav files when it is omitted.
    """
    args = get_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    indir = args.wav_directory
    if args.tsv_directory:
        outdir = args.tsv_directory
        # Create the requested output directory up front, so a missing path
        # does not abort the run only after the first transcription is done.
        outdir.mkdir(parents=True, exist_ok=True)
    else:
        outdir = indir

    model = whisperx.load_model("medium.en", device=device)
    # The alignment model depends only on the language and the device, so it
    # is loaded once here instead of once per file.
    model_a, metadata = whisperx.load_align_model(language_code="en", device=device)

    for wav in indir.glob("*.wav"):
        audio = whisperx.load_audio(str(wav))
        output = model.transcribe(audio, language="en")
        result = whisperx.align(output["segments"], model_a, metadata, audio, device, return_char_alignments=False)
        outfile = outdir / f"{wav.stem}.tsv"
        # Explicit encoding: the platform default may not be able to
        # represent every character the model emits.
        with open(outfile, "w", encoding="utf-8") as tsv:
            if "word_segments" in result:
                for res in result["word_segments"]:
                    # NOTE(review): words the aligner could not time are
                    # presumably returned without "start"/"end" - confirm
                    # against the whisperX docs. Such words are written with
                    # empty time fields instead of raising KeyError.
                    tsv.write(f'{res.get("start", "")}\t{res.get("end", "")}\t{res["word"]}\n')


if __name__ == '__main__':
    main()