Extract SLT metadata
The data has been updated, so this is obsolete
!pip install ffmpeg-python
# Root directory that is searched recursively for .shn files below.
_BASE = "/Users/joregan/asr/slt/audio"
import json
from pathlib import Path
# Index the JSON-lines manifest: one JSON object per line, keyed by the
# stem of the object's "path" entry.  A later line with the same stem
# replaces the earlier one.
slt = {}
with open("/Users/joregan/asr/slt.json") as manifest:
    for raw_line in manifest:
        record = json.loads(raw_line)
        slt[Path(record["path"]).stem] = record
from pathlib import Path
# ffmpeg-python (pip-installed above) provides `ffmpeg.probe`; the module
# was used below without ever being imported.
import ffmpeg

# Maps the tag names read from each file's probe output to the keys used
# in the output records.
_TAG_TO_FIELD = {
    "Gender": "gender",
    "UserID": "user_id",
    "Dialect": "dialect",
    "recording_date": "recording_date",
    "recording_time": "recording_time",
}

base = Path(_BASE)
# Probe every .shn file under the base directory and merge its tags into
# `slt`, keyed by the file's stem.
for shn in base.glob("**/*.shn"):
    meta = {"id": shn.stem}
    probe = ffmpeg.probe(shn)
    # Skip files whose probe output has no format-level tags.  The previous
    # guard joined the two checks with `and`, so it raised KeyError when
    # "format" was absent and never skipped when only "tags" was absent.
    tags = probe.get("format", {}).get("tags")
    if tags is None:
        continue
    # Copy over only the tags that are actually present on this file.
    for tag_name, field in _TAG_TO_FIELD.items():
        if tag_name in tags:
            meta[field] = tags[tag_name]
    # Merge into the existing manifest record when there is one; otherwise
    # start a new record holding just the probed metadata.
    if meta["id"] in slt:
        slt[meta["id"]].update(meta)
    else:
        slt[meta["id"]] = meta
# Write the merged records out as JSON Lines: one object per line, in the
# dictionary's insertion order.
with open("slt-meta.json", "w") as out_file:
    for record in slt.values():
        out_file.write(json.dumps(record) + "\n")