Split Braxen by language
I thought this would be useful; unfortunately it is not, because the data is not suitable for anything.
from pathlib import Path
# Location of the tab-separated Braxen dictionary that gets split by language.
BRAXEN_DICT = Path("/Users/joregan/Playing/braxen/dict/braxen-sv.tsv")

# data[lang][word] -> list of raw pronunciation strings, in file order.
data = {}

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(BRAXEN_DICT, encoding="utf-8") as bf:
    for line in bf:
        # Drop the line terminator first so it can never leak into the last
        # field (and from there into a language tag / output filename).
        parts = line.rstrip("\n").split("\t")

        # Column 0 is the word, column 1 the pronunciation and column 3 the
        # language tag. Rows with fewer than four columns (which includes
        # lines without any tab) cannot be classified and are skipped
        # instead of raising IndexError.
        if len(parts) < 4:
            continue

        word, phon, lang = parts[0], parts[1], parts[3]

        # Entries with an unknown or missing language are of no use here.
        if lang in ("unk", ""):
            continue

        data.setdefault(lang, {}).setdefault(word, []).append(phon)
# Tokens that are removed from a transcription entirely.
_SKIPPED_TOKENS = ("~", "|", ".", "-")
# Single-character prefixes removed from the front of a token
# (presumably stress/accent marks -- confirm against the Braxen notation).
_STRIPPED_PREFIXES = (",", "'", '"')


def clean_phones(phone_string: str) -> str:
    """Return *phone_string* with separator tokens and leading marks removed.

    The input is split on single spaces. Tokens equal to ``~``, ``|``, ``.``
    or ``-`` are dropped. For every remaining token, one leading ``,``, ``'``
    or ``"`` is stripped (only the first character, so ``''a`` becomes
    ``'a``). The surviving tokens are joined with single spaces again.

    Because the split is on a literal space, consecutive spaces in the input
    yield empty tokens, which are kept as-is.
    """
    kept = [p for p in phone_string.split(" ") if p not in _SKIPPED_TOKENS]
    # str.startswith accepts a tuple and is False for the empty string, so no
    # separate emptiness check is needed.
    return " ".join(
        p[1:] if p.startswith(_STRIPPED_PREFIXES) else p for p in kept
    )
# Write one file per language to /tmp, each line being "<word>\t<phones>".
for lang in data:
    out_path = Path(f"/tmp/braxen-{lang}.txt")
    # Encode explicitly as UTF-8 so non-ASCII words are written the same way
    # regardless of the platform's default locale encoding.
    with open(out_path, "w", encoding="utf-8") as of:
        for word in sorted(data[lang]):
            # De-duplicate AFTER cleaning: two raw pronunciations that differ
            # only in the symbols clean_phones removes would otherwise be
            # written as identical duplicate lines. Sorting makes the output
            # reproducible (set iteration order of strings varies per run).
            for phones in sorted({clean_phones(p) for p in data[lang][word]}):
                of.write(f"{word}\t{phones}\n")