# Built-in fallback prompt: a sample of conversational English that keeps
# fillers ("ah", "eh", "um"), word repetitions and cut-off words as spoken.
# It is substituted for `prompt` when that setting is left empty.
# NOTE(review): presumably intended to nudge the recogniser towards verbatim,
# disfluent output -- confirm with the author. The wording is deliberate, so
# do not "correct" the text; surrounding whitespace is removed by .strip().
PROMPT = """
But then if you want to to remove some of the pieces, I think ah the the two ah pieces, the one the red one and the brown one there, eh those are the one down there.
Mhm.
Mm.
Hmm.
It's like a base. Ehm. They mm they are too flat I think for for the rest the rest are more like vertical pieces and those are super horizontal pieces then it's not um... It's not working for the place I think.
Um yeah they are a little bit creepy but I mean it's it's a matter of uh choice i- if you don't like them you can actually remove them I think that that would be totally fine even maybe you can actually put some of these pieces there and then you can maybe have books here.
But I don't think so. I think it looks nice. It's a nice division and it it gives you a nice view of the... I mean, it's a cen- it's a focus point that then leads you to the... to the green garden, yes.
Uh so yeah, but but yeah, I mean, sometimes, eh whenever it's not, I mean, eh there is not that much light, I take them out in the balcony.
""".strip()

# ---------------------------------------------------------------------------
# User-editable settings
# ---------------------------------------------------------------------------

# Directory scanned (non-recursively) for "*.wav" input files.
audio_dir = "/home/joregan/hsi/audio"
# Directory that receives one "<audio stem>.json" file per input.
json_dir = "/tmp/whisperx-json"
# Model identifier handed to whisperx.load_model().
whisper_model = "large-v2"
# Model identifier handed to whisperx.load_align_model(model_name=...).
align_model = "facebook/wav2vec2-large-robust-ft-swbd-300h"
# Custom prompt text; leave empty to fall back to the built-in PROMPT.
prompt = ""

# Compute type and batch size used for transcription.
compute_type = "float16"
batch_size = 16

# Voice-activity-detection thresholds (forwarded as `vad_options`) and the
# chunk size forwarded to model.transcribe().
vad_onset = 0.500
vad_offset = 0.363
chunk_size = 30

# Forwarded to whisperx.load_model(threads=...).
faster_whisper_threads = 4

# Forwarded to whisperx.align(interpolate_method=...).
interpolate_method = "nearest"

# Space-separated file names (or stems) that should not be processed.
skip_files = ""

import torch

# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# An explicit float16 request is rejected by the CPU backend, which would make
# the CPU fallback above fail as soon as the model is loaded. Downgrade to
# int8, the compute type the WhisperX README suggests for CPU-only runs.
if DEVICE == "cpu" and compute_type == "float16":
    print("No CUDA device found: switching compute_type from float16 to int8")
    compute_type = "int8"

# Use the built-in sample prompt when no custom prompt was configured.
if not prompt:
    prompt = PROMPT

# Turn the space-separated skip list into a list of names. split() without an
# argument drops empty entries, so an empty setting yields [] (not [""]) and
# repeated spaces between names are harmless.
skip_files = skip_files.split()

# Decoding options handed to whisperx.load_model(asr_options=...).
# NOTE(review): the key set looks like a copy of WhisperX's own defaults for
# one particular release -- re-check it when upgrading whisperx/faster-whisper.
default_asr_options = {
    "beam_size": 5,
    "best_of": 5,
    "patience": 1,
    "length_penalty": 1,
    "repetition_penalty": 1,
    "no_repeat_ngram_size": 0,
    "temperatures": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    "compression_ratio_threshold": 2.4,
    "log_prob_threshold": -1.0,
    "no_speech_threshold": 0.6,
    "condition_on_previous_text": False,
    "prompt_reset_on_temperature": 0.5,
    # The effective prompt was previously computed but never passed on (this
    # entry was hard-coded to None), so the prompt setting had no effect.
    "initial_prompt": prompt,
    "prefix": None,
    "suppress_blank": True,
    "suppress_tokens": [-1],
    "without_timestamps": True,
    "max_initial_timestamp": 0.0,
    "word_timestamps": False,
    "prepend_punctuations": "\"'“¿([{-",
    "append_punctuations": "\"'.。,，!！?？:：”)]}、",
    "suppress_numerals": False,
    "max_new_tokens": None,
    "clip_timestamps": None,
    "hallucination_silence_threshold": None,
}

from pathlib import Path
import gc
import whisperx

# Stage 1: transcription.
# `results` collects (transcription result, audio path) pairs; `tmp_results`
# is scratch space reused by the alignment stage.
results = []
tmp_results = []

# English-only transcription model with the decoding and VAD settings
# configured above.
model = whisperx.load_model(
    whisper_model,
    device=DEVICE,
    compute_type=compute_type,
    language="en",
    asr_options=default_asr_options,
    vad_options={"vad_onset": vad_onset, "vad_offset": vad_offset},
    task="transcribe",
    threads=faster_whisper_threads,
)

for wavfile in Path(audio_dir).glob("*.wav"):
    # A file is processed only if neither its full name nor its stem is on
    # the skip list.
    if wavfile.name not in skip_files and wavfile.stem not in skip_files:
        audio_path = str(wavfile)
        # `audio` stays bound after the loop; the alignment stage reuses it
        # when exactly one file was transcribed.
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(
            audio,
            batch_size=batch_size,
            chunk_size=chunk_size,
            print_progress=True,
        )
        results.append((result, audio_path))


# Drop the transcription model and release cached memory before the
# alignment model is loaded.
del model
gc.collect()
torch.cuda.empty_cache()

# Stage 2: forced alignment.
# From here on `align_model` holds the loaded model rather than its name.
align_model, align_metadata = whisperx.load_align_model(
    "en", DEVICE, model_name=align_model
)

# Move the transcriptions aside and rebuild `results` with aligned output.
tmp_results = results
results = []
for result, audio_path in tmp_results:
    # With several files only the path is passed on; with a single file the
    # waveform still bound to `audio` from the transcription stage is reused.
    input_audio = audio_path if len(tmp_results) > 1 else audio

    # Nothing to align: keep the transcription result as it is.
    if align_model is None or len(result["segments"]) == 0:
        results.append((result, audio_path))
        continue

    # A language mismatch is only reported; alignment is still attempted
    # with the English model.
    if result.get("language", "en") != align_metadata["language"]:
        print("Error: found something other than English", audio_path)
    print(">>Performing alignment...")
    result = whisperx.align(
        result["segments"],
        align_model,
        align_metadata,
        input_audio,
        DEVICE,
        interpolate_method=interpolate_method,
        return_char_alignments=False,
        print_progress=True,
    )
    results.append((result, audio_path))

# Drop the alignment model and release cached memory.
del align_model
gc.collect()
torch.cuda.empty_cache()

import json

# Stage 3: write one JSON file per processed audio file.
json_path = Path(json_dir)
# parents=True also creates missing ancestor directories; exist_ok=True makes
# the call a no-op when the directory is already there (re-runs).
json_path.mkdir(parents=True, exist_ok=True)

for result, audio_path in results:
    # Output is named after the audio file: "<stem>.json".
    audio_stem = Path(audio_path).stem
    json_file = json_path / f"{audio_stem}.json"
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of depending on the locale default.
    with open(json_file, "w", encoding="utf-8") as outfile:
        json.dump(result, outfile, ensure_ascii=False)