The text is on Wikisource

This is a really bad example of pretty much everything, but it covers the most common cases, and the extraction needed to be manually checked anyway.

# Maps the French grammatical labels (full words and abbreviations, with and
# without the trailing dot) to the English key names written into the records.
# The keys are also joined into the alternation used by the REGEX_CASE*
# patterns, so every spelling that should be recognised needs its own entry.
FORM_MAPS = {
    "le génitif": "genitive",
    "gen.": "genitive",
    "gén.": "genitive",
    "gén": "genitive",
    "dat.": "dative",
    "dat": "dative",
    "plur.": "plural",
    "plur": "plural",
    "futur": "future",
    "fut.": "future",
    "participe": "participle",
    "comp.": "comparative",
    "comp": "comparative",
    "compar.": "comparative",
    "compar": "comparative",
    "impér.": "imperative",
    "impér": "imperative",
    "diminutif": "diminutive",
    "vocatif": "vocative",
    "gén. fém.": "genitive_feminine",
    "superl.": "superlative",
    "part.": "participle"
}

import json
# Load the entries extracted so far. JSON is UTF-8 by specification, so the
# encoding is given explicitly instead of relying on the platform default.
with open("sjoestedt_phonetique.json", encoding="utf-8") as inf:
    DATA = json.load(inf)

section = 30

# Always start the working section from an empty list; anything already
# stored under this (integer) key is discarded. Keys loaded from JSON are
# strings, so this never touches the entries that came from the file.
DATA[section] = []

import re
# Alternation of every French form label known to FORM_MAPS.
# - Labels are escaped so that the "." of an abbreviation only matches a
#   literal dot; otherwise "dat," would match "dat." and the later
#   FORM_MAPS lookup would raise KeyError.
# - Longer labels come first so that "gén. fém." is preferred over its
#   prefix "gén.".
_FORM_LABELS = "|".join(
    re.escape(label) for label in sorted(FORM_MAPS, key=len, reverse=True)
)

# transcription (irish) « french », FORM transcription (irish)
REGEX_CASE = rf"^([^(]+) \(([^)]+)\) « ([^»]+) », ({_FORM_LABELS}) ([^(]+) \(([^)]+)\)$"
# transcription (irish)[,] FORM de transcription « french »
REGEX_CASE_OF = rf"^([^(]+) \(([^)]+)\),? ({_FORM_LABELS}) de ([^«]+) « ([^»]+) »$"
# transcription (irish)[,] FORM de transcription (irish) « french »
REGEX_CASE_OF_BOTH = rf"^([^(]+) \(([^)]+)\),? ({_FORM_LABELS}) de ([^«]+) \(([^)]+)\) « ([^»]+) »$"
# transcription (irish) « french »
REGEX_BASE = r"^([^(]+) \(([^)]+)\) « ([^»]+) »$"
# transcription « french »
REGEX_JUST_FRENCH = r"^([^(]+) « ([^»]+) »$"
# transcription (irish) « french »[,] de transcription (irish) « french »
REGEX_FROM = r"^([^(]+) \(([^)]+)\) « ([^»]+) »,? de ([^(]+) \(([^)]+)\) « ([^»]+) »$"
# transcription (irish), dans transcription « french »
REGEX_IN = r"^([^(]+) \(([^)]+)\), dans ([^«]+) « ([^»]+) »$"
# transcription (irish) ou transcription (irish) « french »
REGEX_OR = r"^([^(]+) \(([^)]+)\) ou ([^(]+) \(([^)]+)\) « ([^»]+) »$"

def _match_records(segment):
    """Return the record fields described by one stripped text segment.

    The patterns are tried in a fixed priority order and matching stops at
    the first hit. Each dict in the returned list holds the fields of one
    output record, without "section" and "id". ``None`` is returned when no
    pattern recognises the segment.
    """
    m = re.match(REGEX_BASE, segment)
    if m:
        return [
            {"transcription": m.group(1), "irish": m.group(2), "french": m.group(3)},
        ]

    m = re.match(REGEX_JUST_FRENCH, segment)
    if m:
        return [
            {"transcription": m.group(1), "french": m.group(2)},
        ]

    m = re.match(REGEX_FROM, segment)
    if m:
        # Derived word first (pointing at its base via "from"), then the base.
        return [
            {
                "transcription": m.group(1),
                "irish": m.group(2),
                "french": m.group(3),
                "from": m.group(5),
            },
            {"transcription": m.group(4), "irish": m.group(5), "french": m.group(6)},
        ]

    m = re.match(REGEX_OR, segment)
    if m:
        # Two alternatives sharing one gloss; each points at the other.
        return [
            {
                "transcription": m.group(1),
                "irish": m.group(2),
                "french": m.group(5),
                "alt": m.group(4),
            },
            {
                "transcription": m.group(3),
                "irish": m.group(4),
                "french": m.group(5),
                "alt": m.group(2),
            },
        ]

    m = re.match(REGEX_IN, segment)
    if m:
        return [
            {"transcription": m.group(1), "irish": m.group(2), "in": m.group(3)},
            {"transcription": m.group(3), "french": m.group(4)},
        ]

    m = re.match(REGEX_CASE, segment)
    if m:
        form = FORM_MAPS[m.group(4)]
        # Base word carrying its inflected form, then the inflected form
        # pointing back at the base via "<form>_of".
        return [
            {
                "transcription": m.group(1),
                "irish": m.group(2),
                "french": m.group(3),
                form: m.group(6),
            },
            {
                "transcription": m.group(5),
                "irish": m.group(6),
                f"{form}_of": m.group(2),
            },
        ]

    m = re.match(REGEX_CASE_OF_BOTH, segment)
    if m:
        form = FORM_MAPS[m.group(3)]
        return [
            {
                "transcription": m.group(1),
                "irish": m.group(2),
                f"{form}_of": m.group(5),
            },
            {
                "irish": m.group(5),
                "transcription": m.group(4),
                "french": m.group(6),
                form: m.group(2),
            },
        ]

    m = re.match(REGEX_CASE_OF, segment)
    if m:
        form = FORM_MAPS[m.group(3)]
        return [
            {
                "transcription": m.group(1),
                "irish": m.group(2),
                f"{form}_of": m.group(4),
            },
            {"transcription": m.group(4), "french": m.group(5), form: m.group(2)},
        ]

    return None


def extract(text, splitter, counter):
    """Split ``text`` into examples and append them to ``DATA[section]``.

    Soft hyphens are removed and newlines become spaces before the text is
    split on ``splitter``. Every segment that matches one of the known
    patterns yields one or two records, each stamped with the module-level
    ``section`` and an id of the form ``"<section>_<n>"``.

    Args:
        text: raw text holding the examples.
        splitter: separator between examples.
        counter: number used for the first record's id; it is incremented
            for every record appended.

    Segments that match no pattern are printed so they can be handled by
    hand. The counter value after the last record is printed at the end;
    nothing is returned.
    """
    text = text.replace("\u00ad", "").replace("\n", " ")
    for part in text.split(splitter):
        records = _match_records(part.strip())
        if records is None:
            print(part)
            continue
        for fields in records:
            entry = {"section": section, "id": f"{section}_{counter}"}
            entry.update(fields)
            DATA[section].append(entry)
            counter += 1
    print(counter)

# Raw example text keyed by section number. The value is a run of examples
# that the statements below normalise and hand to extract(), split on ";".
# NOTE: the name REST is rebound to a list slice further down in this file.
REST = {
    # 302: "go rᴇ̈: mah əgɑt (go raibh maith agat) « merci »",
    321: "vʹi: klᴀᴜn nə gᴜ̃:rsən ɛg i·əsg̬əχ ⸢ɛr fʹαg⸣ ⸤ǥɑ: lᴇ̈:⸥ (bhí clann na gcomh­arsan ag iascach ar feadh dhá lae) « les fils des voisins ont été à la pêche pendant deux jours »; tɑ: nə prɑ:tɪ: go ⸢hɔlk⸣ (ta na prataí go holc) « les pommes de terre sont mauvaises » ; vʷɪlʹ ᴇ̈ᵊn ᴜ:rɑ̃:n ə nᴇ̈:ᵊχʌr ⸤əgɑt⸥ (an bhfuil aon amhrán i n-aon chor agat ?) « est-ce que tu sais chanter ? » ; fʹì:aχ ⸤ə vʷɪlʹ⸥ ᴇ̈ᵊn fʹiŋʹgʹɩnʹ ⸢ɩkʹi⸣ ⸤ɩ çαno:χ⸥ ⸢pʲaᴜn⸣ ⸤dɔm-sə⸥ (feuch an bhfuil aon phinginn aici a cheannóchadh peann dom-sa) « regarde si elle n’a pas un penny, avec lequel je pourrais m’acheter une plume »"
}

section = 321
# Make sure the working section has a list to append to.
DATA.setdefault(section, [])

# Normalise the raw text so that every example is separated by ";" and every
# closing guillemet is preceded by exactly one space, as the patterns expect.
text = (
    REST[section]
    .replace(" : ", " ; ")
    .replace("\u00ad", "")
    .replace("»", " »")
    .replace("  »", " »")
    .replace(", mais", " ; ")
)
extract(text, ";", 1)

5

def split_trans(pos):
    """Turn a multi-variant transcription into a list of variants.

    The transcription of ``DATA[section][pos]`` is split on the first of
    " ou ", " et ", " à côté de " that it contains; if it contains none of
    them the entry is left untouched.
    """
    entry = DATA[section][pos]
    for separator in (" ou ", " et ", " à côté de "):
        if separator in entry["transcription"]:
            entry["transcription"] = entry["transcription"].split(separator)
            break

# Split multi-variant transcriptions for every entry of the current section
# that has a transcription at all.
for s, entry in enumerate(DATA[section]):
    if "transcription" in entry:
        split_trans(s)

import json

# Entries loaded from the JSON file live under string keys, new ones under
# the integer key. When both exist, append the new entries to the stored
# ones and drop the integer key before writing.
section_key = str(section)
if section_key in DATA and section in DATA:
    tmp = DATA[section_key] + DATA[section]
    DATA[section_key] = tmp
    del DATA[section]

with open("sjoestedt_phonetique.json", "w") as outf:
    json.dump(DATA, outf)

def french_to_note(pos):
    """Rename the "french" field of ``DATA[section][pos]`` to "french_note"."""
    entry = DATA[section][pos]
    entry["french_note"] = entry.pop("french")

def contrast(pos, pos1):
    """Record on entry ``pos`` that it contrasts with entry ``pos1``.

    "contrast" receives the other entry's "irish" value when it has one and
    its "transcription" otherwise; "contrast_id" receives the other entry's
    id. Both positions index into ``DATA[section]``.
    """
    entries = DATA[section]
    other = entries[pos1]
    target = entries[pos]
    target['contrast'] = other['irish'] if 'irish' in other else other['transcription']
    target['contrast_id'] = other['id']

def contrast_next(pos):
    """Record on entry ``pos`` that it contrasts with the entry after it.

    ``pos`` may be negative (counted from the end of ``DATA[section]``).

    Raises:
        IndexError: if ``pos`` is -1. The last entry has no successor, and
            ``pos + 1`` would otherwise be 0 and silently select the first
            entry of the section.
    """
    if pos == -1:
        raise IndexError("contrast_next: the last entry has no following entry")
    contrast(pos, pos + 1)

def syllabified(item):
    """Move a hyphen-syllabified transcription into "syllabified".

    ``item["transcription"]`` may be a string or a list of variant strings.
    When any of them contains "-", the original value is stored under
    "syllabified" and "transcription" is replaced by the same value with all
    hyphens removed. Items without hyphens (including an empty variant
    list) are left unchanged. The item is modified in place.
    """
    transcription = item["transcription"]
    if isinstance(transcription, str):
        if "-" in transcription:
            item["syllabified"] = transcription
            item["transcription"] = transcription.replace("-", "")
    # Look at every variant, not just the first, so a hyphen that only
    # appears in a later variant is not missed.
    elif any("-" in variant for variant in transcription):
        item["syllabified"] = transcription
        item["transcription"] = [
            variant.replace("-", "") for variant in transcription
        ]

def syllabify():
    """Run ``syllabified`` over every entry of the current section, in order."""
    entries = DATA[section]
    for entry in entries:
        syllabified(entry)

def intonation(item):
    """Move a transcription carrying intonation brackets into "intonation".

    ``item["transcription"]`` may be a string or a list of variant strings
    (as produced by ``split_trans``). When any of them contains an opening
    bracket "⸢" or "⸤", the original value is stored under "intonation" and
    "transcription" is replaced by the same value with all four bracket
    characters removed. Other items are left unchanged. The item is
    modified in place.
    """
    def strip_marks(text):
        for mark in ("⸢", "⸣", "⸤", "⸥"):
            text = text.replace(mark, "")
        return text

    transcription = item["transcription"]
    is_single = isinstance(transcription, str)
    variants = [transcription] if is_single else transcription
    if not any("⸢" in variant or "⸤" in variant for variant in variants):
        return

    item["intonation"] = transcription
    if is_single:
        item["transcription"] = strip_marks(transcription)
    else:
        item["transcription"] = [strip_marks(variant) for variant in variants]

def intone():
    """Run ``intonation`` over every entry of the current section, in order."""
    entries = DATA[section]
    for entry in entries:
        intonation(entry)

# Manual fix-up kept for reference (disabled):
# DATA[section][0]["transcription"] = "vʹi: klᴀᴜ(n) nə gᴜ̃:rsən ɛg i·əsg̬əχ ⸢ɛr fʹα(g)⸣ ⸤ǥɑ: lᴇ̈:⸥"
# Apply intonation() to every entry of the current section.
intone()


# syllabify()
# contrast(8, 10)
# DATA[264][0]["transcription"] = DATA[264][0]["transcription"].split(", ")
# contrast_next(-3)
# DATA[section].append(
#     {
#       "section": 301,
#       "id": "301_2",
#       "transcription": "gʷᴇ̈:h",
#     }
# )
# DATA[section][-1]["for"] = "kᴜ̃:nəv"
# DATA[section][-2]["from"] = [DATA[section][-2]["from"], "deallramh"]
# DATA[section][-2]["transcription"] = "ən lɑ̃:v ə ʃi:nʹtər ə ka:ⁱnʹtər"
# del(DATA[section][5]['french'])

# DATA[section][-1]["transcription"] = "ˈʃαχ(t) ˌno:rʃɩ"
# contrast_next(-2)

# Records (carrying ids from sections 17 and 177) that the loop below re-files
# under a new section; each dict is modified in place by that loop.
mods = [
    {
      "section": 17,
      "id": "17_23",
      "irish": "lomaim",
      "transcription": "lɔmʷɩmʹ",
      "french": "je me dépouille, je deviens chauve",
      "future": "lomfad"
    },
    {
      "section": 177,
      "id": "177_25",
      "transcription": "sɔnə",
      "irish": "sona",
      "french": "heureux"
    },
    {
      "section": 177,
      "id": "177_23",
      "transcription": "krɔmʷɩmʹ",
      "irish": "cromaim",
      "french": "je courbe"
    },
   {
      "section": 177,
      "id": "177_24",
      "transcription": "fɔnəvər",
      "irish": "fonnmhar",
      "french": "désireux"
    },
]
# Replacement transcriptions, index-aligned with `mods`.
modwith = ["lo̤mʷɩmʹ", "so̤nə", "kro̤mʷɩmʹ", "fo̤nəvər"]

section = 180
DATA[section] = []
# Re-file every record of `mods` under the new section: remember the id it
# had before as "compare_id", give it a fresh 1-based id and swap in the
# replacement transcription with the same index.
for i, mod in enumerate(mods):
    mod["section"] = section
    mod["compare_id"], mod["id"] = mod["id"], f"{section}_{i + 1}"
    mod["transcription"] = modwith[i]
    DATA[section].append(mod)

# Cut section 261 in two after its seventh entry: START keeps the first
# seven entries, REST everything from the eighth on.
START, REST = DATA["261"][:7], DATA["261"][7:]

def increment_id(item):
    """Bump the running number of ``item["id"]`` by one, in place.

    The new id is built from ``item["section"]`` and the number found after
    the first underscore of the old id. The same ``item`` is returned.
    """
    old_number = int(item["id"].split("_")[1])
    item["id"] = f'{item["section"]}_{old_number + 1}'
    return item

# Entry to insert as the eighth item of section 261.
ADD = dict(
    section=261,
    id="261_8",
    transcription="məˈlaχtu:",
    cf_section=268,
)

# Rebuild the section: first seven entries, the new entry, then the
# remaining entries with their ids shifted up by one.
DATA["261"] = [*START, ADD, *(increment_id(x) for x in REST)]

# Move the last eight entries of the current section into section 302.
holding = DATA[section][-8:]
for h in holding:
    # NOTE(review): replace() rewrites every "301" in the id, not only the
    # section prefix; confirm that no running number contains "301".
    h.update(section=302, id=h["id"].replace("301", "302"))
s302 = list(holding)
DATA[section] = DATA[section][:-8]
section = 302
DATA[section] = s302