Expand Connaught lexicon
Explicitly models schwa deletion, and reuses the pronunciations of dialectal replacement forms for the standard spellings they replace.
# Source pronunciation dictionary (read) and its expanded copy (written).
# The encoding is pinned because entries contain accented characters that are
# compared against the string literals in this file; the platform default
# encoding is not guaranteed to decode them the same way.
# NOTE(review): assumes the dictionary file is UTF-8 -- confirm.
dictionary = open(
    "/home/jim/Playing/mfa_alignments/snc.dict", encoding="utf-8"
)
dictionary2 = open(
    "/home/jim/Playing/mfa_alignments/snc.dict.exp", "w", encoding="utf-8"
)
# Maps a headword to its replacement form(s). A value is either a single
# string or a list of strings when there are several replacements.
# Keys must be unique: a repeated key in a dict literal silently keeps only
# the last value, so multiple replacements always go in a list.
alts = {
    "acu": "acub",
    "againn": "ainn",
    "agam": "am",
    "agamsa": "amsa",
    "agat": "ad",
    "agatsa": "adsa",
    "arís": "aríst",
    "bóthar": "bór",
    "ceistigh": "ceisnigh",
    "claí": "cladh",
    "cluiche": "cluife",
    "contae": "condae",
    "croitheadh": "crathadh",
    "dada": "tada",
    "daoibh": "dhaoib",
    # Previously two separate "de" entries; the first ("dhe") was lost.
    "de": ["dhe", "ge"],
    "deartháir": "driotháir",
    "dheirfiúr": "dhrifiúr",
    "dheirfiúracha": "dhrifiúracha",
    "dom": "dhom",
    "domsa": "dhomsa",
    "droichead": ["draed", "draighead"],
    "duit": "dhuit",
    "duitse": "dhuitse",
    "díbh": "díofa",
    "díobh": "díob",
    "dócha": "dóiche",
    "dóibh": "dóib",
    "dóigh": "dóiche",
    "dúinn": "dhúinn",
    "féin": "fhéin",
    "foighne": "foighid",
    "folach": "falach",
    "foscadh": "fascadh",
    "gnaithe": "gnaíthe",
    "iúdás": "iúdas",
    "leo": ["leob", "leofa"],
    "léi": ["léithe", "léí"],
    "litir": "leitir",
    "luigh": "loigh",
    "luí": "loighe",
    "mé": "me",
    "naimhdeach": "náimhdeach",
    "namhaid": "náimhid",
    "nuacht": "nuaíocht",
    "nuachta": "nuaíochta",
    "nóiméad": ["móiméad", "mhóiméad"],
    "nóiméid": ["móiméid", "mhóiméid"],
    "orthu": "orthub",
    "scafánta": "scufánta",
    "scornach": "scórnach",
    "sé": "se",
    "sibh": "sib",
    "taispeáin": "taspáin",
    "taispeánfaidh": "taspánfaidh",
    "teacht": "tíocht",
    "theacht": "thíocht",
}
# Hand-written pronunciations for some of the replacement forms in ``alts``.
# Each value is a space-separated phone string.
# NOTE(review): presumably these forms are not headwords in the source
# dictionary, hence the name -- confirm.
nonwords = {
    "bór": "b oo r",
    "draed": "d r ee d",
    "draighead": "d r ai d",
    "ge": "g @",
    "léí": "lj ee ii",
}
# Further hand-written pronunciations (space-separated phone strings).
# NOTE(review): the name suggests these forms may or may not be present in
# the source dictionary -- confirm.
maybe_missing = {
    "duit": "d i tj",
    "nuaíocht": "n uu i@ x t",
    "nuaíochta": "n uu i@ x t @",
    "am": "a m",  # a'm
    "móiméad": "m oo mj ee d",
    "móiméid": "m oo mj ee dj",
    "mhóiméad": "v oo mj ee d",
    "mhóiméid": "v oo mj ee dj",
    "taspánfaidh": "t @ s p aa nn h @",  # 0 t @ s . 1 p aa nn . 0 h @
}
# All hand-written pronunciations, keyed by replacement form.
_ALTS = {**nonwords, **maybe_missing}
# Not referenced anywhere in the visible code; kept so the name still exists.
_SKIP_ALTS = []

# _SOUGHT: replacement forms that have no hand-written pronunciation in
#   _ALTS, listed once each in the order they first appear in ``alts``.
# _REVERSE_ALTS: replacement form -> set of the ``alts`` keys that list it
#   (several keys may share one replacement).
_SOUGHT = []
_REVERSE_ALTS = {}
for _headword, _replacements in alts.items():
    # Normalise the "single string or list of strings" value shape.
    if not isinstance(_replacements, list):
        _replacements = [_replacements]
    for _replacement in _replacements:
        if _replacement not in _ALTS and _replacement not in _SOUGHT:
            _SOUGHT.append(_replacement)
        _REVERSE_ALTS.setdefault(_replacement, set()).add(_headword)
def deletable_schwa_single(word, phones):
    """Return ``(word, phones)`` plus variants with an edge schwa removed.

    The schwa phone is written ``"@"``. The result always starts with the
    unmodified entry, followed (where applicable, in this order) by:

    * the phones without a leading schwa,
    * the phones without both the leading and the trailing schwa,
    * the phones without a trailing schwa.

    A pronunciation consisting of a single schwa gets ``["sil"]`` as its
    only variant instead of an empty phone list.

    Args:
        word: the headword, passed through unchanged.
        phones: list of phone strings for ``word``.

    Returns:
        A list of ``(word, phones)`` tuples.
    """
    out = [(word, phones)]
    if not phones:
        # Nothing to delete; indexing below would raise IndexError.
        return out
    if len(phones) == 1 and phones[0] == "@":
        out.append((word, ["sil"]))
        return out

    leading = phones[0] == "@"
    trailing = phones[-1] == "@"
    if leading:
        out.append((word, phones[1:]))
        # Dropping both edges of a two-schwa pronunciation would leave an
        # empty phone list, which is not a usable entry, so skip that case.
        if trailing and len(phones) > 2:
            out.append((word, phones[1:-1]))
    if trailing:
        out.append((word, phones[:-1]))
    return out
def deletable_schwa(wordlist):
    """Apply ``deletable_schwa_single`` to every entry and flatten the result.

    Args:
        wordlist: iterable of entries whose first element is the word and
            whose second element is its phone list.

    Returns:
        One flat list with the results for each entry, in input order.
    """
    return [
        variant
        for entry in wordlist
        for variant in deletable_schwa_single(entry[0], entry[1])
    ]
def nasal_o(item):
    """Return ``item`` plus variants with ``oo`` changed to ``uu`` after m/v/n.

    For each rule (spelling fragment, phone sequence, replacement) a variant
    is added when the fragment occurs in the word and the phone sequence
    occurs in the space-joined pronunciation. Matching is done on the joined
    string, so a sequence can also match at the end of a longer phone
    (e.g. ``"n oo"`` inside ``"nn oo"``), and every occurrence is replaced.

    Args:
        item: ``(word, phones)`` where ``phones`` is a list of phone strings.

    Returns:
        A list starting with ``item`` itself, followed by any variants.
    """
    # FIXME: way too simplistic
    spelling, pron = item[0], item[1]
    joined = " ".join(pron)
    rules = (
        ("mó", "m oo", "m uu"),
        ("mhó", "v oo", "v uu"),
        ("nó", "n oo", "n uu"),
    )
    variants = [item]
    for fragment, plain, raised in rules:
        if fragment in spelling and plain in joined:
            variants.append((spelling, joined.replace(plain, raised).split(" ")))
    return variants
def endswith_list(text, endings):
    """Return True if ``text`` ends with at least one of ``endings``.

    Args:
        text: the string to test.
        endings: iterable of candidate suffix strings.

    Returns:
        ``True`` on the first matching suffix, ``False`` if none match
        (including when ``endings`` is empty).
    """
    return any(text.endswith(suffix) for suffix in endings)
def handle_igh(item):
    """Return ``item`` plus alternative endings for words in -igh / -dh.

    The variants depend on the spelling and on the last phone:

    * ``-igh`` ending in ``@``: also allow ``ii`` as the last phone.
    * ``-igh`` ending in ``ii`` (but not ``-uigh`` / ``-aoigh``): also allow
      ``@`` as the last phone.
    * ``-dh`` ending in ``@``: also allow an extra final ``x``, ``tj`` or
      ``v``.
    * ``-dh`` ending in ``x``: also allow dropping the ``x``, or replacing
      it with ``tj`` or ``v``.

    Args:
        item: ``(word, phones)`` tuple; ``phones`` is a list of phone
            strings.

    Returns:
        A list starting with ``item`` itself, followed by any variants.

    Raises:
        TypeError: if ``item`` is not a tuple.
    """
    if not isinstance(item, tuple):
        # repr() keeps the message buildable for any value; concatenating
        # the raw item failed with an unrelated error for non-strings.
        raise TypeError("item is not a tuple: " + repr(item))
    word = item[0]
    phones = item[1]
    out = [item]
    if not phones:
        # No final phone to inspect; indexing phones[-1] would fail.
        return out

    last = phones[-1]
    stem = phones[:-1]
    if word.endswith("igh"):
        if last == "@":
            out.append((word, stem + ["ii"]))
        elif last == "ii" and not word.endswith(("uigh", "aoigh")):
            out.append((word, stem + ["@"]))
    if word.endswith("dh"):
        if last == "@":
            out.extend((word, phones + [extra]) for extra in ("x", "tj", "v"))
        elif last == "x":
            out.append((word, stem))
            out.extend((word, stem + [extra]) for extra in ("tj", "v"))
    return out
# Expand each "word<TAB>phones" line of the source dictionary into all of its
# variants and write them, one "word phone phone ..." line each, to the
# output dictionary. try/finally guarantees both files are closed even if a
# line fails to process.
try:
    for line in dictionary:
        line = line.strip()
        if not line:
            # Blank lines carry no entry; pieces[1] below would not exist.
            continue
        pieces = line.split("\t")
        word = pieces[0]
        phones = pieces[1].split(" ")
        entries = [(word, phones)]

        if word in _SOUGHT:
            # This word is a replacement form: give its pronunciation to
            # every headword in ``alts`` that lists it.
            for replacement_word in _REVERSE_ALTS[word]:
                tmp_replace = [(replacement_word, b) for (a, b) in entries]
                entries.extend(tmp_replace)
        elif word in alts:
            tmp_words = alts[word]
            if not isinstance(tmp_words, list):
                tmp_words = [tmp_words]
            for tmp_word in tmp_words:
                # Replacement forms in _SOUGHT are handled by the branch
                # above when their own line is read; the rest have a
                # hand-written pronunciation in _ALTS.
                if tmp_word not in _SOUGHT:
                    entries.append((word, _ALTS[tmp_word].split(" ")))

        if word.endswith("acha") or word.endswith("anna"):
            # Also allow "ii" in place of the final phone.
            entries.append((word, phones[:-1] + ["ii"]))

        if word.endswith("igh") or word.endswith("dh"):
            tmp_igh = []
            for entry in entries:
                tmp_igh.extend(handle_igh(entry))
            entries.extend(tmp_igh)

        tmp_nasal = []
        for entry in entries:
            tmp_nasal.extend(nasal_o(entry))
        entries.extend(tmp_nasal)

        tmp_schwa = deletable_schwa(entries)
        entries.extend(tmp_schwa)

        joined = [" ".join([a] + b) for (a, b) in entries]
        # Deduplicate first, then sort: sorting before building the set
        # discarded the ordering and made the output order vary per run.
        for entry in sorted(set(joined)):
            dictionary2.write(entry + "\n")
finally:
    dictionary.close()
    dictionary2.close()