Start page

# Parsed results. The page-parsing loop further down fills this with
# {section number: [entry dict, ...]}.
DATA = {}

# Maps an abbreviation as it is written in the page text to a snake_case
# identifier. get_sources() tests these keys as prefixes of the text, trying
# longer keys first, so "O.Ir. acc. pl." is tried before "O.Ir.".
# NOTE(review): most keys look like bibliographic source abbreviations --
# confirm against the book's list of abbreviations.
SOURCES = {
    "Atk.": "atkinsons",
    "Cl. S.": "claidheamh_soluis",
    "D. P.": "derry_people",
    "Di.": "dinneen",
    "Diss.": "die_lautliche_geltung",
    "Finck": "finck",
    "G. J.": "gaelic_journal",
    "Henebry": "henebry",
    "Hogan": "hogan",
    "Macbain": "macbain",
    "Meyer": "meyer",
    "Molloy": "molloy",
    "Pedersen": "pedersen",
    "Rhys": "rhys",
    "Sg. Fearn.": "sgeulaidhe_fearnmhuighe",
    "Spir. Rose": "spiritual_rose",
    "O’R.": "oreilly",
    "Wi.": "windisch",
    "O.Ir. acc. pl.": "old_irish_accusative_plural",

    # NOTE(review): presumably language-stage labels rather than sources.
    "M.Ir.": "middle_irish",
    "O.Ir.": "old_irish",
}

DATA

{490: [{'section': 490,
   'id': '490_1',
   'transcription': 'glαk gə ·sɔkyr′ ə',
   'english': 'take it easy'},
  {'section': 490,
   'id': '490_2',
   'transcription': 'Nα kyr′ kɔ ·t′Uw̥ iəd',
   'english': 'do not set them so close'}]}

section = 1

PAGE = """
§ 3. This sound frequently represents O.Ir. a in accented syllables before non-palatal con­sonants, e.g. αrəm, ‘army’, O.Ir. arm; αt, ‘swelling’, O.Ir. att; fαnαχt ‘to stay, remain’, O.Ir. anaim; kαpəL, ‘mare’, M.Ir. capall; mαk, ‘son’, O.Ir. macc; mαLαχt, ‘curse’, O.Ir. maldacht; tαχtuw, ‘to choke’, O.Ir. tachtad; tαrt, ‘thirst’, O.Ir. tart; tαruw, ‘bull’, M.Ir. tarb.

§ 4. O.Ir. e before non-palatal con­sonants in accented syllables usually gives α, e.g. αχ, ‘steed’, O.Ir. ech; αlə, ‘swan’, M.Ir. ela; αŋ, ‘splice, strip’; αŋαχ, ‘fisherman’s net’, M.Ir. eng; dʹrʹαm, ‘crowd’, M.Ir. dremm; dʹαrəg, ‘red’, O.Ir. derg; fʹαr, ‘man’, O.Ir. fer; gʹαl, ‘white’, M.Ir. gel; kʹαχtər, ‘either’, O.Ir. cechtar; Lʹαnuw, ‘child’, M.Ir. lenab; Nʹαd, ‘nest’, M.Ir. net; pʹαkuw, ‘sin’, O.Ir. peccad; ʃαsuw, ‘to stand’, M.Ir. sessom; tʹαχ, ‘house’, O.Ir. tech.
"""
# Stored as the "page" field of every entry parsed from PAGE.
PAGE_NUM = 5

# Remove soft hyphens (U+00AD) from the page text, cut it at newlines and
# keep only the non-empty lines.
page_lines = [ln for ln in PAGE.replace("\u00ad", "").split("\n") if ln]

# Start from an empty result mapping before the page is parsed.
DATA = {}

def extend_trans(item, trans):
    """Add the transcription *trans* to the entry dict *item*, in place.

    - If *item* has no "transcription" key yet, *trans* is stored as-is.
    - If it holds a single value, that value becomes a two-element list
      ``[existing, trans]``.
    - If it already holds a list, *trans* is added to the end of it (a new
      list is stored; the previous list object is not mutated).

    Returns None.
    """
    if "transcription" not in item:
        item["transcription"] = trans
        # Stop here: falling through would immediately replace the value just
        # stored with the pair [trans, trans].
        return

    existing = item["transcription"]
    if isinstance(existing, list):
        # Keep the list flat instead of nesting it as [[a, b], c].
        item["transcription"] = [*existing, trans]
    else:
        item["transcription"] = [existing, trans]

import re

# Leading "<transcription>, ‘<gloss>’" of one example.
#   group 1: everything before the opening quote ‘ (the comma between the
#            transcription and the quote is optional, because some examples
#            are printed without it);
#   group 2: the gloss. A ’ that is directly followed by a word character is
#            treated as an apostrophe inside the gloss (e.g. "fisherman’s
#            net"), not as the closing quote.
_BASIC = r"^([^‘]+?),?\s*‘((?:[^’]|’(?=\w))+)’"
BASIC = re.compile(_BASIC)


def get_basic(text):
    """Split one example into ``(transcription, english, rest)``.

    *text* is expected to begin with a transcription followed by an English
    gloss in single curly quotes, e.g. ``"αrəm, ‘army’, O.Ir. arm"``.

    Returns a 3-tuple of strings:
      - the transcription, stripped of surrounding whitespace;
      - the gloss without its quotes, stripped;
      - whatever follows the closing quote, with surrounding whitespace and
        one leading comma removed ("" when nothing follows).

    When *text* does not start with that shape, ``("", "", text)`` is
    returned so the caller still has the unparsed text.
    """
    m = BASIC.match(text)
    if m is None:
        return ("", "", text)

    t = m.group(1).strip()
    e = m.group(2).strip()

    # Normalise the remainder whether or not a comma separates it from the
    # gloss, so later prefix checks never see leading whitespace.
    rest = text[m.end():].strip()
    if rest.startswith(","):
        rest = rest[1:].strip()
    return (t, e, rest)

def get_sources(text):
    """Identify the SOURCES abbreviation that *text* starts with.

    A leading "cp." is removed first and remembered. The keys of SOURCES are
    then tried as prefixes of the remaining text, longest key first, and the
    first (i.e. longest) key that matches wins.

    Returns a 3-tuple ``(source, compare, rest)``:
      - source:  the SOURCES value for the matched abbreviation, or "" when
                 no abbreviation matches;
      - compare: True when *text* began with "cp.", else False;
      - rest:    the text after the matched abbreviation, stripped; when
                 nothing matched, the text after the optional "cp.".

    NOTE(review): the previous version computed these values but returned
    nothing; the tuple shape here follows get_basic()'s 3-tuple convention --
    confirm it suits the callers.
    """
    compare = text.startswith("cp.")
    if compare:
        text = text[3:].strip()

    # Longest abbreviation first, so that e.g. "O.Ir. acc. pl." is recognised
    # before its own prefix "O.Ir.". Returning on the first hit is what makes
    # that ordering effective.
    for abbrev in sorted(SOURCES, key=len, reverse=True):
        if text.startswith(abbrev):
            return (SOURCES[abbrev], compare, text[len(abbrev):].strip())
    return ("", compare, text)

NoneType

# Walk the page line by line and collect the examples of every numbered
# section into DATA[section].
for line in page_lines:
    # Only lines of the form "§ <number>. <text>" are parsed.
    if not line.startswith("§ "):
        continue

    # The section number sits between the "§ " prefix and the first ". ".
    dot = line.find(". ")
    pn = line[2:dot]
    try:
        section = int(pn)
    except ValueError:
        # Not a plain integer section number: skip the line.
        continue

    if section not in DATA:
        DATA[section] = []
    current = {}

    if "e.g." not in line:
        continue

    linep = line.split("e.g.")
    if len(linep) != 2:
        # More than one "e.g." in the line: only the text between the first
        # and the second one is parsed below, so show the line for review.
        print(line)

    # Continue numbering after any entries the section already has, so ids
    # stay unique within a section.
    counter = len(DATA[section]) + 1

    # Examples are separated by ";".
    parts = [x.strip() for x in linep[1].split(";")]
    for part in parts:
        # Drop the full stop that closes the last example of the paragraph.
        if part.endswith("."):
            part = part[:-1]
        # r (the text after the gloss) is not stored yet.
        t, e, r = get_basic(part)
        current = {
            "page": PAGE_NUM,
            "section": section,
            "id": f"{section}_{counter}",
            "transcription": t,
            "english": e
        }

        DATA[section].append(current)
        counter += 1

<re.Match object; span=(0, 12), match='αrəm, ‘army’'>
<re.Match object; span=(0, 14), match='αt, ‘swelling’'>
None
<re.Match object; span=(0, 13), match='kαpəL, ‘mare’'>
<re.Match object; span=(0, 10), match='mαk, ‘son’'>
<re.Match object; span=(0, 15), match='mαLαχt, ‘curse’'>
<re.Match object; span=(0, 18), match='tαχtuw, ‘to choke’'>
<re.Match object; span=(0, 14), match='tαrt, ‘thirst’'>
<re.Match object; span=(0, 13), match='tαruw, ‘bull’'>
<re.Match object; span=(0, 11), match='αχ, ‘steed’'>
<re.Match object; span=(0, 11), match='αlə, ‘swan’'>
<re.Match object; span=(0, 19), match='αŋ, ‘splice, strip’'>
<re.Match object; span=(0, 17), match='αŋαχ, ‘fisherman’'>
<re.Match object; span=(0, 15), match='dʹrʹαm, ‘crowd’'>
<re.Match object; span=(0, 13), match='dʹαrəg, ‘red’'>
<re.Match object; span=(0, 11), match='fʹαr, ‘man’'>
<re.Match object; span=(0, 13), match='gʹαl, ‘white’'>
<re.Match object; span=(0, 17), match='kʹαχtər, ‘either’'>
<re.Match object; span=(0, 15), match='Lʹαnuw, ‘child’'>
<re.Match object; span=(0, 12), match='Nʹαd, ‘nest’'>
<re.Match object; span=(0, 13), match='pʹαkuw, ‘sin’'>
<re.Match object; span=(0, 17), match='ʃαsuw, ‘to stand’'>
<re.Match object; span=(0, 13), match='tʹαχ, ‘house’'>

part[m.span()[1]:]

', O.Ir. tech'