BuNaMo to json
Possibly incomplete; Kaggle version here
from lxml import etree
class BuNaMoWrongDocument(Exception):
"""Exception raised for wrong document type"""
def __init__(self, expected, got):
self.expected = expected
self.got = got
self.message = f"Expected root element <{self.expected}> but got <{self.got}>"
super().__init__(self.message)
Various functions to read one of the types of XML file. The open parts of speech (noun, adjective, verb) can have multiple forms, so those functions return attributes (a dictionary) and forms (a list of dictionaries) separately.
Close parts of speech (possessives and prepositions) are simpler, and most of the attributes are needless, so they return a simple dictionary containing the forms.
def read_adjective(file):
tree = etree.parse(file)
root = tree.getroot()
valid_tags = ['sgNom', 'sgGenMasc', 'sgGenFem', 'plNom', 'graded', 'abstractNoun', 'sgVocMasc', 'sgVocFem']
attribs = {}
forms = []
if root.tag != 'adjective':
raise BuNaMoWrongDocument('adjective', root.tag)
attribs['default'] = root.get('default')
attribs['declension'] = root.get('declension')
attribs['disambig'] = root.get('disambig')
attribs['isPre'] = root.get('isPre')
for child in root:
if child.tag not in valid_tags:
raise Exception('Unexpected tag ' + child.tag)
tmp = {}
tmp['props'] = child.tag
tmp['form'] = child.get('default')
forms.append(tmp)
return attribs, forms
def read_noun(file):
tree = etree.parse(file)
root = tree.getroot()
valid_tags = ['sgNom', 'sgGen', 'plNom', 'plGen', 'count', 'sgDat']
attribs = {}
forms = []
if root.tag != 'noun':
raise BuNaMoWrongDocument('noun', root.tag)
attribs['default'] = root.get('default')
attribs['declension'] = root.get('declension')
attribs['disambig'] = root.get('disambig')
attribs['isProper'] = root.get('isProper')
attribs['isDefinite'] = root.get('isDefinite')
attribs['allowArticledGenitive'] = root.get('allowArticledGenitive')
for child in root:
if child.tag not in valid_tags:
raise Exception('Unexpected tag ' + child.tag)
tmp = {}
tmp['props'] = child.tag
tmp['form'] = child.get('default')
tmp['gender'] = child.get('gender')
tmp['strength'] = child.get('strength')
forms.append(tmp)
return attribs, forms
def read_verb(file):
tree = etree.parse(file)
root = tree.getroot()
valid_tags = ['verbalNoun', 'verbalAdjective', 'tenseForm', 'moodForm']
attribs = {}
forms = []
if root.tag != 'verb':
raise BuNaMoWrongDocument('verb', root.tag)
attribs['default'] = root.get('default')
attribs['disambig'] = root.get('disambig')
for child in root:
if child.tag not in valid_tags:
raise Exception('Unexpected tag ' + child.tag)
tmp = {}
tmp['props'] = child.tag
tmp['form'] = child.get('default')
tmp['tense'] = child.get('tense')
tmp['mood'] = child.get('mood')
tmp['dependency'] = child.get('dependency')
tmp['person'] = child.get('person')
forms.append(tmp)
return attribs, forms
def read_nounphrase(file):
tree = etree.parse(file)
root = tree.getroot()
valid_tags = ['sgNom', 'sgGen', 'plNom', 'plGen', 'sgNomArt', 'sgGenArt', 'plNomArt', 'plGenArt']
attribs = {}
forms = []
if root.tag != 'nounPhrase':
raise BuNaMoWrongDocument('nounPhrase', root.tag)
attribs['default'] = root.get('default')
attribs['declension'] = root.get('declension')
attribs['disambig'] = root.get('disambig')
attribs['isProper'] = root.get('isProper')
attribs['isDefinite'] = root.get('isDefinite')
attribs['allowArticledGenitive'] = root.get('allowArticledGenitive')
attribs['forceNominative'] = root.get('forceNominative')
for child in root:
if child.tag not in valid_tags:
raise Exception('Unexpected tag ' + child.tag)
tmp = {}
tmp['props'] = child.tag
tmp['form'] = child.get('default')
tmp['gender'] = child.get('gender')
tmp['strength'] = child.get('strength')
forms.append(tmp)
return attribs, forms
def read_possessive(file):
tree = etree.parse(file)
root = tree.getroot()
valid_tags = ['full', 'apos']
attribs = {}
forms = []
if root.tag != 'possessive':
raise BuNaMoWrongDocument('possessive', root.tag)
attribs['default'] = root.get('default')
attribs['disambig'] = root.get('disambig')
attribs['mutation'] = root.get('mutation')
for child in root:
if child.tag not in valid_tags:
raise Exception('Unexpected tag ' + child.tag)
if child.tag == 'apos':
attribs['apos'] = child.get('default')
return attribs
def read_preposition(file):
tree = etree.parse(file)
root = tree.getroot()
valid_tags = ['sg1', 'sg2', 'sg3Masc', 'sg3Fem', 'pl1', 'pl2', 'pl3']
attribs = {}
forms = []
if root.tag != 'preposition':
raise BuNaMoWrongDocument('preposition', root.tag)
attribs['default'] = root.get('default')
for child in root:
if child.tag not in valid_tags:
raise Exception('Unexpected tag ' + child.tag)
attribs[child.tag] = child.get('default')
return attribs
import glob
import json
adjectives = {}
for x in glob.glob('../input/bunamo-bunachar-naisiunta-moirfeolaiochta/adjective/*.xml'):
fname = x.split('/')[-1].replace('.xml', '')
attribs, forms = read_adjective(x)
tmp = {}
tmp['attributes'] = attribs
tmp['forms'] = forms
adjectives[fname] = tmp
word = attribs['default']
with open('adjectives.json', 'w') as outfile:
json.dump(adjectives, outfile)
nouns = {}
for x in glob.glob('../input/bunamo-bunachar-naisiunta-moirfeolaiochta/noun/*.xml'):
fname = x.split('/')[-1].replace('.xml', '')
attribs, forms = read_noun(x)
tmp = {}
tmp['attributes'] = attribs
tmp['forms'] = forms
nouns[fname] = tmp
word = attribs['default']
with open('nouns.json', 'w') as outfile:
json.dump(nouns, outfile)
nounphrases = {}
for x in glob.glob('../input/bunamo-bunachar-naisiunta-moirfeolaiochta/nounPhrase/*.xml'):
fname = x.split('/')[-1].replace('.xml', '')
attribs, forms = read_nounphrase(x)
tmp = {}
tmp['attributes'] = attribs
tmp['forms'] = forms
nounphrases[fname] = tmp
word = attribs['default']
with open('nounphrases.json', 'w') as outfile:
json.dump(nounphrases, outfile)
verbs = {}
for x in glob.glob('../input/bunamo-bunachar-naisiunta-moirfeolaiochta/verb/*.xml'):
fname = x.split('/')[-1].replace('.xml', '')
attribs, forms = read_verb(x)
tmp = {}
tmp['attributes'] = attribs
tmp['forms'] = forms
verbs[fname] = tmp
word = attribs['default']
with open('verbs.json', 'w') as outfile:
json.dump(verbs, outfile)
preposition = {}
for x in glob.glob('../input/bunamo-bunachar-naisiunta-moirfeolaiochta/preposition/*.xml'):
fname = x.split('/')[-1].replace('.xml', '')
attribs = read_preposition(x)
tmp = {}
tmp['attributes'] = attribs
preposition[fname] = tmp
with open('prepositions.json', 'w') as outfile:
json.dump(preposition, outfile)
possessive = {}
for x in glob.glob('../input/bunamo-bunachar-naisiunta-moirfeolaiochta/possessive/*.xml'):
fname = x.split('/')[-1].replace('.xml', '')
attribs = read_possessive(x)
tmp = {}
tmp['attributes'] = attribs
possessive[fname] = tmp
with open('possessives.json', 'w') as outfile:
json.dump(possessive, outfile)
possessive