WhisperX prompted
An attempt to get better output from WhisperX by giving Whisper an initial prompt and by aligning with a wav2vec 2.0 model trained on more spontaneous speech.
# Fallback initial prompt: a sample of verbatim, disfluent transcription
# (filled pauses, repetitions, false starts). It is used whenever `prompt`
# below is left empty -- presumably to nudge Whisper towards keeping
# disfluencies in its output.
PROMPT = """
But then if you want to to remove some of the pieces, I think ah the the two ah pieces, the one the red one and the brown one there, eh those are the one down there.
Mhm.
Mm.
Hmm.
It's like a base. Ehm. They mm they are too flat I think for for the rest the rest are more like vertical pieces and those are super horizontal pieces then it's not um... It's not working for the place I think.
Um yeah they are a little bit creepy but I mean it's it's a matter of uh choice i- if you don't like them you can actually remove them I think that that would be totally fine even maybe you can actually put some of these pieces there and then you can maybe have books here.
But I don't think so. I think it looks nice. It's a nice division and it it gives you a nice view of the... I mean, it's a cen- it's a focus point that then leads you to the... to the green garden, yes.
Uh so yeah, but but yeah, I mean, sometimes, eh whenever it's not, I mean, eh there is not that much light, I take them out in the balcony.
""".strip()

# --- Run parameters ---------------------------------------------------------
audio_dir = "/home/joregan/hsi/audio"  # directory searched for *.wav input
json_dir = "/tmp/whisperx-json"  # directory receiving one JSON file per input
whisper_model = "large-v2"  # model name given to whisperx.load_model
align_model = "facebook/wav2vec2-large-robust-ft-swbd-300h"  # alignment model name
prompt = ""  # custom initial prompt; empty means "use PROMPT"
# NOTE(review): float16 is typically unavailable on CPU backends -- confirm
# before relying on the "cpu" fallback chosen below.
compute_type = "float16"
batch_size = 16  # passed to model.transcribe
vad_onset = 0.500  # passed to whisperx.load_model via vad_options
vad_offset = 0.363  # passed to whisperx.load_model via vad_options
chunk_size = 30  # passed to model.transcribe
faster_whisper_threads = 4  # passed to whisperx.load_model as `threads`
interpolate_method = "nearest"  # passed to whisperx.align
skip_files = ""  # whitespace-separated file names (or stems) to skip

import torch

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# An empty prompt parameter selects the built-in example prompt.
if not prompt:
    prompt = PROMPT

# str.split() with no argument tolerates repeated/leading/trailing whitespace
# and turns an empty parameter into an empty list (no "" entries).
skip_files = skip_files.split()

# Decoding options handed to whisperx.load_model as `asr_options`.
default_asr_options = {
    "beam_size": 5,
    "best_of": 5,
    "patience": 1,
    "length_penalty": 1,
    "repetition_penalty": 1,
    "no_repeat_ngram_size": 0,
    "temperatures": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    "compression_ratio_threshold": 2.4,
    "log_prob_threshold": -1.0,
    "no_speech_threshold": 0.6,
    "condition_on_previous_text": False,
    "prompt_reset_on_temperature": 0.5,
    # Previously hard-coded to None, which meant the prompt resolved above
    # was never given to the model at all.
    "initial_prompt": prompt,
    "prefix": None,
    "suppress_blank": True,
    "suppress_tokens": [-1],
    "without_timestamps": True,
    "max_initial_timestamp": 0.0,
    "word_timestamps": False,
    "prepend_punctuations": "\"'“¿([{-",
    "append_punctuations": "\"'.。,,!!??::”)]}、",
    "suppress_numerals": False,
    "max_new_tokens": None,
    "clip_timestamps": None,
    "hallucination_silence_threshold": None,
}
from pathlib import Path
import gc
import whisperx
# Stage 1: transcribe every *.wav file found directly under audio_dir.
# Each entry of `results` is a (transcription result, audio path) pair.
results = []
tmp_results = []
model = whisperx.load_model(
    whisper_model,
    device=DEVICE,
    compute_type=compute_type,
    language="en",
    asr_options=default_asr_options,
    vad_options={"vad_onset": vad_onset, "vad_offset": vad_offset},
    task="transcribe",
    threads=faster_whisper_threads,
)
for wav_path in Path(audio_dir).glob("*.wav"):
    # A file is skipped when either its full name or its stem is listed.
    listed = wav_path.name in skip_files or wav_path.stem in skip_files
    if not listed:
        audio_path = str(wav_path)
        # `audio` is deliberately left bound after the loop; the alignment
        # stage reuses it when only a single file was transcribed.
        audio = whisperx.load_audio(audio_path)
        transcript = model.transcribe(
            audio,
            batch_size=batch_size,
            chunk_size=chunk_size,
            print_progress=True,
        )
        results.append((transcript, audio_path))

# Drop the ASR model and release cached GPU memory before the next stage.
del model
gc.collect()
torch.cuda.empty_cache()
# Stage 2: align each transcript against its audio with the alignment model.
# Note that `align_model` is rebound here from the model *name* to the loaded
# model object.
align_model, align_metadata = whisperx.load_align_model(
    "en", DEVICE, model_name=align_model
)
tmp_results, results = results, []
for result, audio_path in tmp_results:
    # With several files, hand over the path; with exactly one, reuse the
    # waveform still held in `audio` from the transcription stage.
    input_audio = audio_path if len(tmp_results) > 1 else audio

    can_align = align_model is not None and len(result["segments"]) > 0
    if can_align:
        detected = result.get("language", "en")
        if detected != align_metadata["language"]:
            # Only reported: alignment still goes ahead with the loaded model.
            print("Error: found something other than English", audio_path)
        print(">>Performing alignment...")
        result = whisperx.align(
            result["segments"],
            align_model,
            align_metadata,
            input_audio,
            DEVICE,
            interpolate_method=interpolate_method,
            return_char_alignments=False,
            print_progress=True,
        )

    # Results without segments are passed through unaligned.
    results.append((result, audio_path))

# Drop the alignment model and release cached GPU memory.
del align_model
gc.collect()
torch.cuda.empty_cache()
import json
# Stage 3: write each result to <json_dir>/<audio file stem>.json.
json_path = Path(json_dir)
# parents=True creates missing ancestor directories; exist_ok=True makes the
# call a no-op when the directory is already there.
json_path.mkdir(parents=True, exist_ok=True)
for result, audio_path in results:
    audio_stem = Path(audio_path).stem
    json_file = json_path / f"{audio_stem}.json"
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of depending on the locale default.
    with json_file.open("w", encoding="utf-8") as outfile:
        json.dump(result, outfile, ensure_ascii=False)