Sjoestedt-Jonval Phonétique
Semi-automated extraction from Marie-Louise Sjoestedt-Jonval's Phonétique d’un parler irlandais de Kerry
The text is on Wikisource
This script is a poor example of coding practice in almost every respect, but it covers the most common entry patterns, and the extracted data needed to be checked manually anyway.
# Maps the French grammatical labels (full words and abbreviations) found in
# the text to the English names used in the extracted records: extract() uses
# the mapped value as a record key, or as the prefix of a "<name>_of" key.
# The keys are also joined with "|" to form the label alternation of the
# REGEX_CASE* patterns, so their insertion order matters to the regex engine.
FORM_MAPS = {
"le génitif": "genitive",
"gen.": "genitive",
"gén.": "genitive",
"gén": "genitive",
"dat.": "dative",
"dat": "dative",
"plur.": "plural",
"plur": "plural",
"futur": "future",
"fut.": "future",
"participe": "participle",
"comp.": "comparative",
"comp": "comparative",
"compar.": "comparative",
"compar": "comparative",
"impér.": "imperative",
"impér": "imperative",
"diminutif": "diminutive",
"vocatif": "vocative",
# NOTE(review): this label starts with the earlier key "gén."; whether the
# longer label can ever win depends on how the alternation is built — verify.
"gén. fém.": "genitive_feminine",
"superl.": "superlative",
"part.": "participle"
}
import json
# Load the entries extracted so far. The file holds IPA and accented text, so
# the encoding is stated explicitly instead of relying on the platform default.
with open("sjoestedt_phonetique.json", encoding="utf-8") as inf:
    DATA = json.load(inf)

# Section currently being worked on; the helpers below read this global.
section = 30

# Start the working list for this section empty. JSON object keys are always
# strings, so the int key can never be present right after loading; the former
# "if not section in DATA" guard was therefore always true and was followed by
# an unconditional reset anyway.
DATA[section] = []
import re
# Alternation of every French form label in FORM_MAPS, embedded in the
# REGEX_CASE* patterns below. Each label is escaped so that its "." matches
# only a literal full stop (unescaped, "gen." would also match e.g. "gens" and
# capture a string that is not a FORM_MAPS key). Longer labels come first so
# that "gén. fém." is not pre-empted by its own prefix "gén.".
_FORM_ALTERNATION = "|".join(
    re.escape(label) for label in sorted(FORM_MAPS, key=len, reverse=True)
)

# All patterns are anchored at both ends and applied to one stripped piece of
# text. "X (Y)" below stands for a transcription followed by a parenthesised
# form, and « ... » for a gloss in guillemets.

# X (Y) « gloss », <label> X2 (Y2)
REGEX_CASE = rf"^([^(]+) \(([^)]+)\) « ([^»]+) », ({_FORM_ALTERNATION}) ([^(]+) \(([^)]+)\)$"
# X (Y)[,] <label> de X2 « gloss »
REGEX_CASE_OF = rf"^([^(]+) \(([^)]+)\),? ({_FORM_ALTERNATION}) de ([^«]+) « ([^»]+) »$"
# X (Y)[,] <label> de X2 (Y2) « gloss »
REGEX_CASE_OF_BOTH = rf"^([^(]+) \(([^)]+)\),? ({_FORM_ALTERNATION}) de ([^«]+) \(([^)]+)\) « ([^»]+) »$"
# X (Y) « gloss »
REGEX_BASE = r"^([^(]+) \(([^)]+)\) « ([^»]+) »$"
# X « gloss »
REGEX_JUST_FRENCH = r"^([^(]+) « ([^»]+) »$"
# X (Y) « gloss »[,] de X2 (Y2) « gloss2 »
REGEX_FROM = r"^([^(]+) \(([^)]+)\) « ([^»]+) »,? de ([^(]+) \(([^)]+)\) « ([^»]+) »$"
# X (Y), dans X2 « gloss »
REGEX_IN = r"^([^(]+) \(([^)]+)\), dans ([^«]+) « ([^»]+) »$"
# X (Y) ou X2 (Y2) « gloss »
REGEX_OR = r"^([^(]+) \(([^)]+)\) ou ([^(]+) \(([^)]+)\) « ([^»]+) »$"
def extract(text, splitter, counter):
    """Split *text* into entries and append the parsed records to ``DATA[section]``.

    Soft hyphens are removed and newlines are turned into spaces. The text is
    then split on *splitter* and every stripped piece is tried against the
    module-level patterns in this order of precedence: ``REGEX_BASE``,
    ``REGEX_JUST_FRENCH``, ``REGEX_FROM``, ``REGEX_OR``, ``REGEX_IN``,
    ``REGEX_CASE``, ``REGEX_CASE_OF_BOTH``, ``REGEX_CASE_OF``. Only the first
    matching pattern is used; it yields one or two records, each stamped with
    the global ``section`` and an id of the form ``"<section>_<counter>"``.
    Pieces that match no pattern are printed so they can be handled by hand.

    Args:
        text: Raw text of the section.
        splitter: Separator between entries in *text*.
        counter: Number used for the id of the first record that is created.

    Returns:
        The next unused counter value (also printed when the text is done).

    Raises:
        KeyError: If a captured form label is not a key of ``FORM_MAPS``.
    """

    def add(fields):
        # Append one record, prefixed with the section and the next free id.
        nonlocal counter
        DATA[section].append(
            {"section": section, "id": f"{section}_{counter}", **fields}
        )
        counter += 1

    # Soft hyphens are dropped here once, so captured groups never contain any.
    text = text.replace("\u00ad", "").replace("\n", " ")
    for p in text.split(splitter):
        piece = p.strip()

        found = re.match(REGEX_BASE, piece)
        if found:
            transcription, irish, french = found.groups()
            add({"transcription": transcription, "irish": irish, "french": french})
            continue

        found = re.match(REGEX_JUST_FRENCH, piece)
        if found:
            transcription, french = found.groups()
            add({"transcription": transcription, "french": french})
            continue

        found = re.match(REGEX_FROM, piece)
        if found:
            (transcription, irish, french,
             src_transcription, src_irish, src_french) = found.groups()
            # Derived word first, pointing at its source; then the source.
            add({
                "transcription": transcription,
                "irish": irish,
                "french": french,
                "from": src_irish,
            })
            add({
                "transcription": src_transcription,
                "irish": src_irish,
                "french": src_french,
            })
            continue

        found = re.match(REGEX_OR, piece)
        if found:
            (transcription, irish,
             alt_transcription, alt_irish, french) = found.groups()
            # Two variants sharing one gloss, each referring to the other.
            add({
                "transcription": transcription,
                "irish": irish,
                "french": french,
                "alt": alt_irish,
            })
            add({
                "transcription": alt_transcription,
                "irish": alt_irish,
                "french": french,
                "alt": irish,
            })
            continue

        found = re.match(REGEX_IN, piece)
        if found:
            transcription, irish, phrase, french = found.groups()
            # The word, then the phrase it occurs in (only the phrase is glossed).
            add({"transcription": transcription, "irish": irish, "in": phrase})
            add({"transcription": phrase, "french": french})
            continue

        found = re.match(REGEX_CASE, piece)
        if found:
            (transcription, irish, french,
             label, form_transcription, form_irish) = found.groups()
            form = FORM_MAPS[label]
            # Base word carrying its inflected form, then the inflected form.
            add({
                "transcription": transcription,
                "irish": irish,
                "french": french,
                form: form_irish,
            })
            add({
                "transcription": form_transcription,
                "irish": form_irish,
                f"{form}_of": irish,
            })
            continue

        found = re.match(REGEX_CASE_OF_BOTH, piece)
        if found:
            (transcription, irish, label,
             base_transcription, base_irish, french) = found.groups()
            form = FORM_MAPS[label]
            # Inflected form first, then the base word it belongs to.
            add({
                "transcription": transcription,
                "irish": irish,
                f"{form}_of": base_irish,
            })
            add({
                "irish": base_irish,
                "transcription": base_transcription,
                "french": french,
                form: irish,
            })
            continue

        found = re.match(REGEX_CASE_OF, piece)
        if found:
            (transcription, irish, label,
             base_transcription, french) = found.groups()
            form = FORM_MAPS[label]
            # The base word has no orthographic form here, so the inflected
            # record refers to it by its transcription.
            add({
                "transcription": transcription,
                "irish": irish,
                f"{form}_of": base_transcription,
            })
            add({
                "transcription": base_transcription,
                "french": french,
                form: irish,
            })
            continue

        # Nothing matched: show the raw piece for manual handling.
        print(p)

    # NOTE(review): the flattened source does not show whether this print sat
    # inside the loop; it is placed after it, reporting the next free counter.
    print(counter)
    return counter
# Raw text still to be extracted, keyed by section number. Entries inside a
# text are separated by ";" and follow the "transcription (irish) « gloss »"
# layout that extract() parses.
REST = {
# 302: "go rᴇ̈: mah əgɑt (go raibh maith agat) « merci »",
321: "vʹi: klᴀᴜn nə gᴜ̃:rsən ɛg i·əsg̬əχ ⸢ɛr fʹαg⸣ ⸤ǥɑ: lᴇ̈:⸥ (bhí clann na gcomharsan ag iascach ar feadh dhá lae) « les fils des voisins ont été à la pêche pendant deux jours »; tɑ: nə prɑ:tɪ: go ⸢hɔlk⸣ (ta na prataí go holc) « les pommes de terre sont mauvaises » ; vʷɪlʹ ᴇ̈ᵊn ᴜ:rɑ̃:n ə nᴇ̈:ᵊχʌr ⸤əgɑt⸥ (an bhfuil aon amhrán i n-aon chor agat ?) « est-ce que tu sais chanter ? » ; fʹì:aχ ⸤ə vʷɪlʹ⸥ ᴇ̈ᵊn fʹiŋʹgʹɩnʹ ⸢ɩkʹi⸣ ⸤ɩ çαno:χ⸥ ⸢pʲaᴜn⸣ ⸤dɔm-sə⸥ (feuch an bhfuil aon phinginn aici a cheannóchadh peann dom-sa) « regarde si elle n’a pas un penny, avec lequel je pourrais m’acheter une plume »"
}
# Switch the working section and make sure it has a list to append to.
# NOTE(review): the assignment after the `if` is presumably its body; the
# indentation appears to have been lost in this copy of the file.
section = 321
if not section in DATA:
DATA[section] = []
# Normalise the raw text so that ";" is the only entry separator and the
# closing guillemet is spaced the way the patterns expect.
text = REST[section].replace(" : ", " ; ")
text = text.replace("\u00ad", "")
# NOTE(review): the next two replacements appear to adjust the whitespace in
# front of "»"; the exact whitespace characters inside the literals cannot be
# told apart by eye — do not retype them.
text = text.replace("»", " »")
text = text.replace(" »", " »")
text = text.replace(", mais", " ; ")
# Parse the entries, numbering the records from 1.
extract(text, ";", 1)
def split_trans(pos):
    """Turn a multi-variant transcription of entry *pos* into a list.

    The entry is taken from ``DATA[section]``. The separators " ou ", " et "
    and " à côté de " are checked in that order; the transcription is split on
    the first one it contains and left untouched if it contains none.
    """
    entry = DATA[section][pos]
    transcription = entry["transcription"]
    for separator in (" ou ", " et ", " à côté de "):
        if separator in transcription:
            entry["transcription"] = transcription.split(separator)
            break
# Split multi-variant transcriptions for every entry of the current section
# that has a transcription at all.
for s, entry in enumerate(DATA[section]):
    if "transcription" in entry:
        split_trans(s)

import json

# Sections loaded from the file have string keys, sections created in this
# session have int keys. json.dump writes int keys as strings, so an int key
# whose string twin also exists would be written as a duplicate JSON key and
# the earlier value would be lost when the file is read back. Fold every such
# pair together (existing entries first), not only the current section.
for key in [k for k in DATA if isinstance(k, int) and str(k) in DATA]:
    DATA[str(key)] = DATA[str(key)] + DATA.pop(key)

with open("sjoestedt_phonetique.json", "w", encoding="utf-8") as outf:
    json.dump(DATA, outf)
def french_to_note(pos):
    """Move the ``french`` value of entry *pos* to the key ``french_note``.

    The entry is taken from ``DATA[section]``; a missing ``french`` key raises
    ``KeyError``.
    """
    entry = DATA[section][pos]
    entry["french_note"] = entry.pop("french")
def contrast(pos, pos1):
    """Record on entry *pos* that it contrasts with entry *pos1*.

    Both entries come from ``DATA[section]``. ``contrast`` receives the other
    entry's ``irish`` value when it has one and its ``transcription``
    otherwise; ``contrast_id`` receives the other entry's ``id``.
    """
    entries = DATA[section]
    other = entries[pos1]
    target = entries[pos]
    target['contrast'] = other['irish'] if 'irish' in other else other['transcription']
    target['contrast_id'] = other['id']
def contrast_next(pos):
    """Mark entry *pos* as contrasting with the entry that follows it.

    Args:
        pos: Index into the current section; negative indices count from the
            end, as usual.

    Raises:
        IndexError: If *pos* is -1. The last entry has no successor, and
            ``pos + 1`` would otherwise wrap around to index 0 and silently
            pair the last entry with the first one.
    """
    if pos == -1:
        raise IndexError("the last entry has no following entry to contrast with")
    contrast(pos, pos + 1)
def syllabified(item):
    """Separate syllable hyphens from the transcription of *item*, in place.

    When the transcription contains "-", the hyphenated value is kept under
    ``syllabified`` and ``transcription`` is replaced by a copy without
    hyphens. The transcription may be a single string or a list of variant
    strings; for a list, a hyphen in any variant triggers the change (the
    previous version looked at the first variant only and failed on an empty
    list). Items without a transcription are left untouched.

    Args:
        item: Record dict, modified in place.
    """
    transcription = item.get("transcription")
    if isinstance(transcription, str):
        if "-" in transcription:
            item["syllabified"] = transcription
            item["transcription"] = transcription.replace("-", "")
    elif transcription and any("-" in variant for variant in transcription):
        item["syllabified"] = transcription
        item["transcription"] = [variant.replace("-", "") for variant in transcription]
def syllabify():
    """Run ``syllabified`` on every entry of the current section."""
    for entry in DATA[section]:
        syllabified(entry)
def intonation(item):
    """Separate intonation brackets from the transcription of *item*, in place.

    When the transcription contains an opening bracket ("⸢" or "⸤"), the
    marked-up value is kept under ``intonation`` and ``transcription`` is
    replaced by a copy with all four bracket characters removed. The
    transcription may be a single string or a list of variant strings, as
    produced by ``split_trans``; lists used to be skipped silently. Items
    without a transcription are left untouched.

    Args:
        item: Record dict, modified in place.
    """
    strip_marks = str.maketrans("", "", "⸢⸣⸤⸥")
    transcription = item.get("transcription")
    if isinstance(transcription, str):
        variants = [transcription]
    else:
        variants = transcription or []
    # Only the opening brackets trigger the change, as before.
    if not any("⸢" in variant or "⸤" in variant for variant in variants):
        return
    item["intonation"] = transcription
    if isinstance(transcription, str):
        item["transcription"] = transcription.translate(strip_marks)
    else:
        item["transcription"] = [variant.translate(strip_marks) for variant in transcription]
def intone():
    """Run ``intonation`` on every entry of the current section."""
    for entry in DATA[section]:
        intonation(entry)
# Post-processing of the current section. Only intone() is active; the
# commented-out statements are kept as they were.
# NOTE(review): they look like one-off manual corrections for individual
# sections (264, 301) — confirm they are no longer needed before deleting.
# NOTE(review): intone() reads DATA[section]; if the merge step before the
# dump moved this section to its string key, that lookup raises KeyError.
# DATA[section][0]["transcription"] = "vʹi: klᴀᴜ(n) nə gᴜ̃:rsən ɛg i·əsg̬əχ ⸢ɛr fʹα(g)⸣ ⸤ǥɑ: lᴇ̈:⸥"
intone()
# syllabify()
# contrast(8, 10)
# DATA[264][0]["transcription"] = DATA[264][0]["transcription"].split(", ")
# contrast_next(-3)
# DATA[section].append(
# {
# "section": 301,
# "id": "301_2",
# "transcription": "gʷᴇ̈:h",
# }
# )
# DATA[section][-1]["for"] = "kᴜ̃:nəv"
# DATA[section][-2]["from"] = [DATA[section][-2]["from"], "deallramh"]
# DATA[section][-2]["transcription"] = "ən lɑ̃:v ə ʃi:nʹtər ə ka:ⁱnʹtər"
# del(DATA[section][5]['french'])
# DATA[section][-1]["transcription"] = "ˈʃαχ(t) ˌno:rʃɩ"
# contrast_next(-2)
# Records that section 180 repeats with a different transcription. Each one
# still carries the section and id it has elsewhere; the loop below keeps that
# id under "compare_id".
mods = [
    {
        "section": 17,
        "id": "17_23",
        "irish": "lomaim",
        "transcription": "lɔmʷɩmʹ",
        "french": "je me dépouille, je deviens chauve",
        "future": "lomfad"
    },
    {
        "section": 177,
        "id": "177_25",
        "transcription": "sɔnə",
        "irish": "sona",
        "french": "heureux"
    },
    {
        "section": 177,
        "id": "177_23",
        "transcription": "krɔmʷɩmʹ",
        "irish": "cromaim",
        "french": "je courbe"
    },
    {
        "section": 177,
        "id": "177_24",
        "transcription": "fɔnəvər",
        "irish": "fonnmhar",
        "french": "désireux"
    },
]
# Replacement transcriptions, in the same order as mods.
modwith = ["lo̤mʷɩmʹ", "so̤nə", "kro̤mʷɩmʹ", "fo̤nəvər"]

# Section 180 is rebuilt from scratch out of the records above.
section = 180
DATA[section] = []
for number, (mod, transcription) in enumerate(zip(mods, modwith), start=1):
    mod["section"] = section
    # Remember the original id before it is overwritten with the new one.
    mod["compare_id"] = mod["id"]
    mod["id"] = f"{section}_{number}"
    mod["transcription"] = transcription
    DATA[section].append(mod)
# Cut section "261" in two at the insertion point: the first seven entries,
# and everything after them.
# NOTE: this rebinds REST, which earlier in the file names the dict of raw
# section texts.
_section_261 = DATA["261"]
START, REST = _section_261[:7], _section_261[7:]
def increment_id(item):
    """Renumber *item* in place so that its id counter is one higher.

    The counter is the part of ``item["id"]`` after the last underscore; the
    new id is rebuilt as ``"<item['section']>_<counter + 1>"``.

    Args:
        item: Record dict with ``id`` and ``section`` keys.

    Returns:
        The same dict, so the function can be used inside a comprehension.

    Raises:
        ValueError: If the trailing part of the id is not an integer.
    """
    _, _, number = item["id"].rpartition("_")
    item["id"] = f'{item["section"]}_{int(number) + 1}'
    return item
# Entry to insert into section "261" as its eighth item; it only points to
# section 268 through "cf_section".
ADD = {
    "section": 261,
    "id": "261_8",
    "transcription": "məˈlaχtu:",
    "cf_section": 268
}
# Reassemble the section: untouched head, the new entry, then the tail with
# every id shifted up by one to make room.
DATA["261"] = [*START, ADD, *(increment_id(entry) for entry in REST)]
# Move the last eight entries of the current section into section 302,
# relabelling their section number and id prefix on the way.
# NOTE(review): at this point in the file `section` is 180, while the id
# rewrite below targets "301" — this looks like it was written to run with
# section = 301; confirm before running the file top to bottom.
holding = DATA[section][-8:]
for moved in holding:
    moved["section"] = 302
    moved["id"] = moved["id"].replace("301", "302")
s302 = list(holding)

# Drop the moved entries from their old section, then switch to the new one.
DATA[section] = DATA[section][:-8]
section = 302
DATA[section] = s302