Convert WhisperX JSON segments to Praat TextGrid files
For HSI project
%pip install praatio
from praatio import textgrid
from praatio.utilities.constants import Interval
# Source location: either a single WhisperX JSON file or a directory of them.
# NOTE: this name shadows the builtin ``input``; it is kept because the rest
# of the notebook refers to it.
input = "/Users/joregan/Playing/hsi/test2/whisperx-json"
# Destination: the TextGrid file to write for a single input file, or the
# directory that will be created to hold one TextGrid per JSON file.
output = "/Users/joregan/Playing/hsi/test2/whisperx-json/textgrids"
from pathlib import Path
import json
# Resolve the configured locations once; the conversion cell below reads
# ``input_path``, ``output_path`` and ``IS_DIR``.
input_path = Path(input)
output_path = Path(output)

# Directory mode: every ``*.json`` file in ``input`` is converted and the
# results are written into a freshly created ``output`` directory.
IS_DIR = input_path.is_dir()
if IS_DIR:
    # Raise instead of ``assert`` so the guard survives ``python -O``, and use
    # a real f-string so the offending path actually appears in the message.
    if output_path.exists():
        raise FileExistsError(f"{output} exists, refusing to overwrite")
    output_path.mkdir()
def pad_silences(segments):
    """Fill the gaps between consecutive segments with blank intervals.

    Args:
        segments: iterable of ``(start, end, label)`` entries, in the order
            they should appear in the output. Entries are only indexed
            positionally (``[0]`` is the start time, ``[1]`` the end time).

    Returns:
        A new list containing every input entry, each preceded by a
        ``(gap_start, gap_end, " ")`` tuple when it starts later than the
        previous entry ended (or later than 0.0 for the first entry).
    """
    padded = []
    cursor = 0.0  # end time of the previous entry; the timeline starts at 0
    for segment in segments:
        # Only emit filler for a real gap: contiguous or overlapping entries
        # would otherwise yield zero- or negative-length intervals.
        if segment[0] > cursor:
            padded.append((cursor, segment[0], " "))
        padded.append(segment)
        cursor = segment[1]
    return padded
def write_converted(infile, outfile, verbose=True):
    """Convert one WhisperX JSON file into a long-format TextGrid.

    The JSON document is expected to hold a ``"segments"`` list whose items
    carry ``"start"``, ``"end"`` and ``"text"`` keys. Each segment becomes one
    interval of a single tier named ``words``; gaps between segments are
    filled by ``pad_silences``.

    Args:
        infile: path (``str`` or ``Path``) of the JSON file to read.
        outfile: path (``str`` or ``Path``) of the TextGrid file to write.
        verbose: when true, print a message explaining why a file was skipped.

    Returns:
        None. Nothing is written when the document has no ``"segments"`` key,
        when the segment list is empty, or when any segment starts after it
        ends.
    """
    # ``Path(...)`` instantiates a platform subclass (PosixPath/WindowsPath),
    # so an exact ``type(x) == Path`` comparison never matches; isinstance does.
    if isinstance(infile, Path):
        infile = str(infile)
    if isinstance(outfile, Path):
        outfile = str(outfile)

    # Read as UTF-8 explicitly rather than relying on the locale's default
    # encoding, so non-ASCII transcript text is decoded consistently.
    with open(infile, encoding="utf-8") as inf:
        data = json.load(inf)

    if "segments" not in data:
        if verbose:
            print("File", infile, "possibly incorrect JSON")
        return

    segments = data["segments"]
    if not segments:
        if verbose:
            print("File", infile, "contains no segments")
        return

    out = []
    for segment in segments:
        # A single inverted segment invalidates the whole file: skip it
        # entirely rather than write a partial TextGrid.
        if segment["start"] > segment["end"]:
            if verbose:
                print("File", infile, "has start time after end time")
                print(segment)
            return
        out.append(Interval(segment["start"], segment["end"], segment["text"]))

    # Tier bounds are taken from the first and last transcribed segments.
    tier_start = out[0][0]
    tier_end = out[-1][1]

    tg = textgrid.Textgrid()
    word_tier = textgrid.IntervalTier('words', pad_silences(out), tier_start, tier_end)
    tg.addTier(word_tier)
    tg.save(outfile, format="long_textgrid", includeBlankSpaces=False)
# Run the conversion: a directory input is processed one JSON file at a time,
# each producing "<stem>.TextGrid" in the output directory; a single-file
# input is converted straight to the configured output path.
if IS_DIR:
    for json_file in input_path.glob("*.json"):
        textgrid_name = json_file.stem + ".TextGrid"
        write_converted(json_file, output_path / textgrid_name)
else:
    write_converted(input, output)