Parse Swedish gigaword XML
Dataset
# Minimal sample document in the corpus XML layout: one <corpus> holding one
# <text>, which holds one <sentence> with two <w> tokens. It is fed to the
# parser through an in-memory stream further down in this file.
example = """\
<corpus id="1960-0000">
<text date="1965-02-14" datefrom="19650214" dateto="19650214" genre="news" publisher="Stockholms Tidningen " timefrom="000000" timeto="235959" topic="Politik och samhällsfrågor" year="1965">
<sentence id="aa9c2ac8-ae5dd1a1">
<w dephead="4" deprel="RA" lemma="|i|" lex="|i..pp.1|" msd="PP" pos="PP" prefix="|" ref="1" saldo="|i..2|" suffix="|">I</w>
<w dephead="3" deprel="DT" lemma="|" lex="|" msd="HD.UTR.SIN.IND" pos="HD" prefix="|" ref="2" saldo="|" suffix="|">vilken</w>
</sentence>
</text>
</corpus>
"""
import xml.etree.ElementTree as ET
def _attrib(node, attrib: str) -> str:
    """Return the whitespace-stripped value of attribute *attrib* on *node*.

    An attribute that is not present in ``node.attrib`` yields the empty
    string instead of raising.
    """
    try:
        raw_value = node.attrib[attrib]
    except KeyError:
        return ""
    return raw_value.strip()
def _iattrib(node, attrib: str) -> int:
    """Return attribute *attrib* of *node* parsed as an integer.

    Surrounding whitespace is ignored before conversion. The result is ``0``
    when the attribute is absent from ``node.attrib`` or when ``int()``
    rejects its value, so callers always receive an ``int``.
    """
    try:
        return int(node.attrib[attrib].strip())
    except (KeyError, ValueError):
        # KeyError: attribute missing; ValueError: value is not an integer.
        return 0
class Corpus:
    """Object view of one parsed XML document.

    Attributes:
        id: Stripped ``id`` attribute of the root element ("" when absent).
        texts: One ``Text`` per ``<text>`` element that is a direct child of
            the root.
    """

    def __init__(self, source):
        """Parse *source* with ``ET.parse`` and wrap the root's texts."""
        document_root = ET.parse(source).getroot()
        self.id = _attrib(document_root, 'id')
        self.texts = [
            Text(child) for child in document_root.findall('./text')
        ]
class Text:
    """Metadata and sentences read from one ``<text>`` element.

    ``date``, ``genre``, ``publisher`` and ``topic`` are kept as stripped
    strings; ``datefrom``, ``dateto``, ``timefrom``, ``timeto`` and ``year``
    are converted to integers by ``_iattrib``. ``sentences`` holds one
    ``Sentence`` per direct ``<sentence>`` child.
    """

    def __init__(self, node):
        """Copy the attributes of *node* and wrap each child sentence."""
        # Assignment order is kept stable: it determines the key order of
        # the instance ``__dict__``.
        self.date = _attrib(node, 'date')
        self.datefrom = _iattrib(node, 'datefrom')
        self.dateto = _iattrib(node, 'dateto')
        self.genre = _attrib(node, 'genre')
        self.publisher = _attrib(node, 'publisher')
        self.timefrom = _iattrib(node, 'timefrom')
        self.timeto = _iattrib(node, 'timeto')
        self.topic = _attrib(node, 'topic')
        self.year = _iattrib(node, 'year')
        self.sentences = [
            Sentence(child) for child in node.findall('./sentence')
        ]
class Sentence:
    """One ``<sentence>`` element together with its word tokens.

    Attributes:
        id: Stripped ``id`` attribute of the element ("" when absent).
        words: One ``Word`` per ``<w>`` element that is a direct child.
    """

    def __init__(self, node):
        """Read the sentence id from *node* and wrap each child token."""
        self.id = _attrib(node, 'id')
        self.words = [Word(token) for token in node.findall('./w')]
class Word:
    """One ``<w>`` token: its annotation attributes and its text.

    Attributes:
        dephead, deprel, lemma, lex, msd, pos, prefix, ref, saldo, suffix:
            Stripped string values of the same-named XML attributes
            ("" when an attribute is absent).
        word: Stripped text content of the element ("" when the element
            has no text).
    """

    def __init__(self, node):
        """Copy the attributes and text of the ``<w>`` element *node*."""
        self.dephead = _attrib(node, 'dephead')
        self.deprel = _attrib(node, 'deprel')
        self.lemma = _attrib(node, 'lemma')
        self.lex = _attrib(node, 'lex')
        self.msd = _attrib(node, 'msd')
        self.pos = _attrib(node, 'pos')
        self.prefix = _attrib(node, 'prefix')
        self.ref = _attrib(node, 'ref')
        self.saldo = _attrib(node, 'saldo')
        self.suffix = _attrib(node, 'suffix')
        # ElementTree sets ``text`` to None for an element without text
        # (``<w/>`` or ``<w></w>``); fall back to "" so one empty token does
        # not abort parsing with AttributeError.
        self.word = (node.text or "").strip()
# Demo: run the parser on the inline sample through an in-memory text stream
# instead of a file on disk.
import io
sio = io.StringIO(example)
corp = Corpus(sio)
# Serialize the parsed object tree. Any object json cannot encode natively is
# replaced by its instance __dict__ via the `default` hook.
import json
# NOTE(review): the returned string is neither stored nor printed here —
# presumably this relies on an interactive session echoing the last
# expression; confirm before running this as a plain script.
json.dumps(corp, default=lambda o: o.__dict__)