Run VibeVoice-ASR on LibriVox recordings
(as a sanity check?)
#!/usr/bin/env python3
import argparse, json, time
from pathlib import Path
import torch
from vibevoice.modular.modeling_vibevoice_asr import VibeVoiceASRForConditionalGeneration
from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor
# File suffixes (lower-case, with leading dot) that are treated as audio input.
AUDIO_EXTS = {".wav", ".mp3", ".flac", ".mp4", ".m4a", ".webm", ".ogg", ".opus"}


def list_audio_files(audio_dir: str) -> list[str]:
    """Return the sorted paths of all audio files found under *audio_dir*.

    The directory is searched recursively. An entry is included when it is a
    regular file and its suffix, compared case-insensitively, is one of
    ``AUDIO_EXTS``.

    Args:
        audio_dir: Directory to search.

    Returns:
        Matching paths as strings, in sorted order.
    """
    root = Path(audio_dir)
    return sorted(
        str(p)
        for p in root.rglob("*")
        if p.is_file() and p.suffix.lower() in AUDIO_EXTS
    )
def _build_arg_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for the transcription script."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--model_path", required=True)
    ap.add_argument("--audio_dir", default="")
    ap.add_argument("--audio_files", nargs="*", default=[])
    ap.add_argument("--out_dir", required=True)
    ap.add_argument(
        "--device",
        default="cuda" if torch.cuda.is_available() else "cpu",
        choices=["cuda", "cpu", "mps", "xpu", "auto"],
    )
    ap.add_argument(
        "--attn_implementation",
        default="sdpa",
        choices=["flash_attention_2", "sdpa", "eager"],
    )
    ap.add_argument("--max_new_tokens", type=int, default=32768)
    ap.add_argument("--temperature", type=float, default=0.0)
    ap.add_argument("--top_p", type=float, default=1.0)
    ap.add_argument("--num_beams", type=int, default=1)
    return ap


def _segments_to_items(segments) -> list[dict]:
    """Convert parsed segments into Start/End/Content (optionally Speaker) items."""
    items = []
    for seg in segments:
        item = {
            "Start": seg.get("start_time"),
            "End": seg.get("end_time"),
            "Content": seg.get("text", ""),
        }
        speaker = seg.get("speaker_id")
        # "Speaker" is only emitted when the segment carries a speaker id.
        if speaker is not None:
            item["Speaker"] = speaker
        items.append(item)
    return items


def main():
    """Transcribe audio files and write one JSON segment list per input.

    Inputs are collected from ``--audio_dir`` (searched via
    ``list_audio_files``) followed by ``--audio_files``. For each input a file
    named ``<stem>.json`` is written into ``--out_dir`` containing a list of
    ``{"Start", "End", "Content"[, "Speaker"]}`` objects.

    Raises:
        SystemExit: If no input files were given or found.
    """
    import sys  # local import: only needed for warnings on stderr

    args = _build_arg_parser().parse_args()

    files = []
    if args.audio_dir:
        files.extend(list_audio_files(args.audio_dir))
    files.extend(args.audio_files)
    if not files:
        raise SystemExit("No audio files found. Use --audio_dir or --audio_files.")

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # float32 on cpu/mps/xpu, bfloat16 otherwise (cuda and auto).
    dtype = torch.float32 if args.device in ("cpu", "mps", "xpu") else torch.bfloat16
    processor = VibeVoiceASRProcessor.from_pretrained(args.model_path)
    model = VibeVoiceASRForConditionalGeneration.from_pretrained(
        args.model_path,
        dtype=dtype,
        # A device map is only requested for "auto"; explicit devices are
        # handled by the .to() call below.
        device_map=args.device if args.device == "auto" else None,
        attn_implementation=args.attn_implementation,
        trust_remote_code=True,
    )
    if args.device != "auto":
        model = model.to(args.device)
        device = args.device
    else:
        # With "auto" placement, send inputs to wherever the first parameter lives.
        device = next(model.parameters()).device
    model.eval()

    # Sampling is enabled only for a positive temperature.
    do_sample = args.temperature > 0

    # Generation settings are the same for every file, so build them once.
    gen_cfg = {
        "max_new_tokens": args.max_new_tokens,
        "do_sample": do_sample,
        "num_beams": args.num_beams,
        "pad_token_id": processor.pad_id,
        "eos_token_id": processor.tokenizer.eos_token_id,
    }
    if do_sample:
        gen_cfg["temperature"] = args.temperature
        gen_cfg["top_p"] = args.top_p

    written = set()
    for f in files:
        inputs = processor(
            audio=f,
            sampling_rate=None,
            return_tensors="pt",
            add_generation_prompt=True,
        )
        inputs = {
            k: v.to(device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_cfg)

        # Keep only the tokens that follow the prompt.
        generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
        raw_text = processor.decode(generated_ids, skip_special_tokens=True)

        # Parse into segments. A parse failure is best-effort (an empty list is
        # still written), but it is reported instead of being hidden.
        try:
            segments = processor.post_process_transcription(raw_text)
        except Exception as exc:
            print(
                f"warning: could not parse transcription for {f}: {exc!r}; "
                "writing an empty segment list",
                file=sys.stderr,
            )
            segments = []

        # Convert to Gradio-style JSON list.
        out_list = _segments_to_items(segments)

        out_path = out_dir / (Path(f).stem + ".json")
        # Outputs are named by stem only, so inputs sharing a stem collide.
        if out_path in written:
            print(
                f"warning: {out_path} was already written in this run and is "
                f"being overwritten by {f}",
                file=sys.stderr,
            )
        written.add(out_path)
        out_path.write_text(
            json.dumps(out_list, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        print(f"✅ wrote {out_path} ({len(out_list)} segments)")
# Run the CLI only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
Running with:
# Transcribe every /data/*/*/*.mp3 that has no sibling .json yet; variables are quoted so paths with spaces work.
for d in /data/*/*; do if [ -d "$d" ]; then for f in "$d"/*.mp3; do [ -e "$f" ] || continue; json="${f%.mp3}.json"; if [ ! -e "$json" ]; then python vibevoice/scripts/transcribe.py --model_path /app/models/VibeVoice-ASR --audio_files "$f" --out_dir "$d"; fi; done; fi; done
done
ChatGPT cleanup (same loop, quoted and reformatted):
# Transcribe each /data/*/*/*.mp3 that does not yet have a sibling .json,
# writing the JSON into the same directory as the audio file.
for d in /data/*/*; do
    # Only directories are scanned for recordings.
    [ -d "$d" ] || continue

    for f in "$d"/*.mp3; do
        # An unmatched glob stays literal; skip it.
        [ -e "$f" ] || continue

        json="${f%.mp3}.json"

        # Skip files whose JSON output already exists.
        [ ! -e "$json" ] || continue

        python vibevoice/scripts/transcribe.py \
            --model_path /app/models/VibeVoice-ASR \
            --audio_files "$f" \
            --out_dir "$d"
    done
done