Waxholm lexicon processing
I can't remember what this was for; I'm sure I'll be reminded
import json
# Load the raw lexicon records. JSON text is UTF-8 by specification, so decode
# it explicitly instead of relying on the platform's locale encoding (which
# would garble non-ASCII characters such as "ö" on non-UTF-8 locales).
with open("/tmp/waxholm_raw_lexicon.json", encoding="utf-8") as lexjson:
    data = json.load(lexjson)
def simplify_stops(text):
    """Normalise stop-consonant symbols in a label string.

    The rewrites are applied strictly in order, each one to the result of the
    previous one:

    1. doubled stop symbols ("Kk", "Gg", "Dd", "Tt", "Bb", "Pp") collapse to
       the single upper-case letter;
    2. any remaining lower-case k/g/d/t/b/p is upper-cased;
    3. the sequences "Kl", "Pa" and "P:" are turned (back) into "kl", "pa"
       and "p:", which step 2 would otherwise have upper-cased.

    Returns the rewritten string.
    """
    # Order matters: e.g. "Kl" -> "kl" must run after "k" -> "K", and the
    # "Pa"/"P:" restorations must run after "p" -> "P".
    rewrites = (
        ("Kk", "K"),
        ("Gg", "G"),
        ("Dd", "D"),
        ("Tt", "T"),
        ("Bb", "B"),
        ("Pp", "P"),
        ("k", "K"),
        ("Kl", "kl"),
        ("g", "G"),
        ("d", "D"),
        ("t", "T"),
        ("b", "B"),
        ("p", "P"),
        ("Pa", "pa"),
        ("P:", "p:"),
    )
    for old, new in rewrites:
        text = text.replace(old, new)
    return text
def simplify_phoneme(text):
    """Return `text` with every "+" removed and every "hy" replaced by "#".

    The "+" removal happens first, so an "h+y" sequence also ends up as "#".
    """
    return text.replace("+", "").replace("hy", "#")
def segment_label(label, skip_pause=True):
    """Split a label string into a list of phone symbols.

    Scanning left to right, at each position:

    * a two-character symbol from the fixed table is emitted as one phone;
    * "p:" is emitted only when `skip_pause` is false, otherwise dropped;
    * "#" and "~" are dropped;
    * anything else is a single character, optionally preceded by one
      prefix mark (', `, ", "," or 2) and optionally followed by one
      suffix mark (:, 3 or 4), emitted together as one phone.

    Args:
        label: the label string to segment.
        skip_pause: when true (the default), "p:" symbols are omitted.

    Returns:
        The list of phone strings, in order of appearance.
    """
    # NOTE(review): "p:_pa" and "pa_p:" are five characters long but are only
    # ever compared with a two-character slice, so they can never match.
    two_char_symbols = ("NG", "E0", "SJ", "TJ", "kl", "sm", "pa", "ha", "öh",
                        "Pa", "p:_pa", "pa_p:")
    ignored = ("#", "~")
    prefix_marks = ("'", "`", "\"", ",", "2")
    suffix_marks = (":", "3", "4")

    phones = []
    pos = 0
    total = len(label)
    while pos < total:
        pair = label[pos:pos + 2]
        if pair in two_char_symbols:
            phones.append(pair)
            pos += 2
            continue
        if pair == "p:":
            if not skip_pause:
                phones.append("p:")
            pos += 2
            continue
        if label[pos:pos + 1] in ignored:
            pos += 1
            continue

        # Generic phone: [prefix mark] + character + [suffix mark].
        first = pos
        if label[pos:pos + 1] in prefix_marks:
            pos += 1
        last = pos
        if label[pos + 1:pos + 2] in suffix_marks:
            last += 1
        phones.append(label[first:last + 1])
        pos = last + 1
    return phones
def lclem(lower):
    """Lower-case a word unless it both starts and ends with "X".

    Words wrapped in "X" (the keys of X_TAGS have this shape) are returned
    unchanged; every other word is returned lower-cased.

    An empty string is returned as an empty string instead of raising
    IndexError; empty words arise when text containing doubled, leading or
    trailing spaces is split on " ".
    """
    if lower.startswith("X") and lower.endswith("X"):
        return lower
    return lower.lower()
# Notebook-style peek at the first record. As a script statement the value is
# discarded; the only effect is an error if `data` has no element 0.
data[0]
# Maps each X-wrapped tag word that can appear in an entry's text to the label
# expected at the matching position of the phoneme sequence (see check_x_tag).
# "XavbrordX" maps to the empty string and is special-cased by its users.
X_TAGS = dict(
    XtvekX="öh",
    XinandX="pa",
    XsmackX="sm",
    XutandX="pa",
    XharklingX="ha",
    XklickX="kl",
    XavbrordX="",
    XskrattX="ha",
    XsuckX="pa",
)
def check_x_tag(word, phoneme):
    """Tell whether `phoneme` is the label expected for the tag `word`.

    Returns:
        True for "XavbrordX" regardless of `phoneme`; for any other key of
        X_TAGS, whether `phoneme` equals the mapped label; None when `word`
        is not a key of X_TAGS.
    """
    # "XavbrordX" is accepted unconditionally, before the table is consulted.
    if word == "XavbrordX":
        return True
    if word not in X_TAGS:
        return None
    return phoneme == X_TAGS[word]
def modify_phonemes_inner(word, phonemes, idx):
    """Make sure the label for tag `word` is present in `phonemes` at `idx`.

    Args:
        word: a word from the entry text; only keys of X_TAGS other than
            "XavbrordX" cause any change.
        phonemes: list of phoneme strings.
        idx: position in `phonemes` that corresponds to `word`, or -1 to
            refer to the end of the list.

    Returns:
        `phonemes` itself (the same list object) when nothing has to change:
        `word` is not a tag, is "XavbrordX", or the expected label is already
        at `idx`. Otherwise a new list with the label inserted before `idx`,
        or appended when `idx` is -1.
    """
    if word not in X_TAGS or word == "XavbrordX":
        return phonemes
    # Guard the index so an empty list gets the label inserted instead of
    # raising IndexError.
    if phonemes and check_x_tag(word, phonemes[idx]):
        # The label is already in place. Without this explicit return the
        # function fell off the end and handed None back to the caller.
        return phonemes
    tag_label = X_TAGS[word]
    if idx == -1:
        return phonemes + [tag_label]
    return phonemes[:idx] + [tag_label] + phonemes[idx:]
# Inline sanity checks for modify_phonemes_inner.
# A tag at the start of the text gets its label inserted at the front ...
wds = ["XinandX", "lördag"]
phn = ["L'Ö32DA"]
assert modify_phonemes_inner(wds[0], phn, 0) == ["pa", "L'Ö32DA"]
# ... and a tag past the end of the phoneme list (idx -1) gets it appended.
wds = ["lördag", "XinandX"]
phn = ["L'Ö32DA"]
assert modify_phonemes_inner(wds[1], phn, -1) == ["L'Ö32DA", "pa"]
def modify_phonemes(words, phonemes):
    """Insert missing tag labels into `phonemes` so it lines up with `words`.

    Each word is passed, in order, to modify_phonemes_inner together with the
    phoneme list built so far and the word's own index (or -1 once the index
    is past the end of that list, meaning "append").

    Args:
        words: list of words from the entry text.
        phonemes: list of phoneme strings for the same entry.

    Returns:
        The resulting phoneme list; this is `phonemes` itself when nothing
        was inserted. An empty list is returned, after printing a message,
        when `phonemes` is None or empty.
    """
    if not phonemes:
        print("Error with phonemes", phonemes)
        return []
    assert isinstance(phonemes, list)
    my_phonemes = phonemes
    for i, word in enumerate(words):
        # my_phonemes may have grown through earlier insertions, so compare
        # against its current length rather than the original one.
        p_i = i if i < len(my_phonemes) else -1
        updated = modify_phonemes_inner(word, my_phonemes, p_i)
        # A None result means "no change"; previously it replaced the list
        # and the next len() call raised TypeError.
        if updated is not None:
            my_phonemes = updated
    return my_phonemes
# Auto-edit pass: where the number of words and phonemes differ, try to insert
# the labels for tag words and record the change on the entry.
for entry in data:
    # Nothing to do for entries without a phoneme string, or with an empty one.
    if "phoneme" not in entry or entry["phoneme"] == "":
        continue
    phonemes = entry["phoneme"].strip().split(" ")
    phonemes_orig = " ".join(phonemes)
    words = entry["text"].strip().split(" ")
    # Not used below; the lookup still requires every entry to have "labels".
    labels = entry["labels"].strip().split(" ")
    if len(words) == len(phonemes):
        continue
    mod = modify_phonemes(words, phonemes)
    rejoined = " ".join(mod)
    if rejoined != phonemes_orig:
        # Keep the untouched value next to the edited one. The key spelling
        # ("orginal") is part of the output format and is left as is.
        entry["phoneme_orginal"] = entry["phoneme"]
        entry["phoneme"] = rejoined

# Persist all records, edited or not.
with open("/tmp/waxholm_autoedit.json", "w") as autoedit:
    json.dump(data, autoedit)
# Build a nested index  word -> phoneme -> label -> set of stems  from every
# record whose text, phoneme and label strings split into equally many parts.
# Records that do not line up are collected in `rest`.
entries = {}
rest = []
for item in data:
    if "phoneme" not in item:
        continue
    # If the labels open with "sm" / "öh" but the text lacks the matching tag
    # word, prepend it (this modifies the record in `data` in place).
    if item["labels"].startswith("sm") and not item["text"].startswith("XsmackX"):
        item["text"] = f'XsmackX {item["text"]}'
    elif item["labels"].startswith("öh") and not item["text"].startswith("XtvekX"):
        item["text"] = f'XtvekX {item["text"]}'
    # Disabled: the same rule for labels starting with "pa" / "XutandX".
    # elif item["labels"].startswith("pa") and not item["text"].startswith("XutandX"):
    #     item["text"] = f'XutandX {item["text"]}'
    phonemes = simplify_phoneme(item["phoneme"]).split(" ")
    labels = simplify_stops(item["labels"]).split(" ")
    words = [lclem(x) for x in item["text"].split(" ")]
    if len(phonemes) == len(labels) == len(words):
        for word, phoneme, label in zip(words, phonemes, labels):
            # setdefault creates each missing level on the way down.
            by_label = entries.setdefault(word, {}).setdefault(phoneme, {})
            by_label.setdefault(label, set()).add(item["stem"])
    else:
        rest.append(item)
# json cannot serialise sets, so replace every stem set with a list before
# dumping. One conversion per leaf is enough (the old innermost loop repeated
# it once per element). Sorting makes the output file reproducible between
# runs; key=str keeps that safe whatever type the stems are (presumably
# strings -- not verifiable from here).
for by_phoneme in entries.values():
    for by_label in by_phoneme.values():
        for label, stems in by_label.items():
            # Rebinding an existing key while iterating is safe: the dict's
            # size does not change.
            by_label[label] = sorted(stems, key=str)
with open("/tmp/simple-aligned-entries.json", "w", encoding="utf-8") as simplef:
    json.dump(entries, simplef)
# Notebook-style peek at how many records could not be aligned (value unused).
len(rest)

# Among the leftovers, collect records whose labels start with "sm" / "öh"
# while their text does not start with the matching tag word.
# NOTE(review): if the loop that fills `rest` has already prepended those tag
# words to the text, neither condition can hold and both lists stay empty.
smacks = []
uh = []
for item in rest:
    for label_prefix, tag_word, bucket in (("sm", "XsmackX", smacks),
                                           ("öh", "XtvekX", uh)):
        if item["labels"].startswith(label_prefix) and not item["text"].startswith(tag_word):
            bucket.append(item)
            break