Sweachum reader
Incomplete corpus reader
# One-sentence excerpt of the corpus in its XML export format:
# <corpus> -> <text> -> <sentence> -> <w> tokens (optionally wrapped in <ne>).
# Literal ampersands are written as "&amp;" because a bare "&" is not
# well-formed XML and makes xml.etree.ElementTree reject the whole document.
# Note that the sample contains the token sequence "&", "amp", ";" as three
# separate <w> elements; _clean_amps() exists to merge such runs.
SAMPLE = """
<corpus id="sweachum">
<text datefrom="20120101" dateto="20121231" timefrom="000000" timeto="235959" lix="55.44" ovix="65.03" nk="2.01" subject="Filosofi" type="PhD" date="2012">
<sentence id="b60ceaf85-b604d04ed" _geocontext="|">
<ne ex="ENAMEX" type="PRS" subtype="HUM" name="Marton">
<w pos="PM" msd="PM.NOM" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="01" deprel="ROOT">Marton</w>
</ne>
<w pos="MID" msd="MID" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="02" dephead="01" deprel="IK">,</w>
<w pos="PM" msd="PM.NOM" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="03" dephead="01" deprel="ET">F.</w>
<w pos="KN" msd="KN.AN" lemma="|&amp;|" lex="|o..kna.2|" sense="|och..1:-1.000|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="04" dephead="03" deprel="HD">&amp;</w>
<w pos="NN" msd="NN.UTR.SIN.IND.NOM" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="05" dephead="03" deprel="HD">amp</w>
<w pos="MID" msd="MID" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="06" dephead="01" deprel="IS">;</w>
<w pos="PM" msd="PM.NOM" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="07" dephead="01" deprel="AN">Booth</w>
<w pos="MID" msd="MID" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="08" dephead="07" deprel="IK">,</w>
<w pos="PM" msd="PM.NOM" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="09" dephead="07" deprel="ET">S</w>
<w pos="MAD" msd="MAD" lemma="|" lex="|" sense="|" prefix="|" suffix="|" compwf="|" complemgram="|" ref="10" dephead="01" deprel="IP">.</w>
</sentence>
</text>
</corpus>
"""
Corpus is here, CC BY 4.0
import xml.etree.ElementTree as ET
import xml.sax.saxutils as saxutils
import io
# Parse the in-memory sample and collect the surface form of every <w> token.
source = io.StringIO(SAMPLE)  # ET.parse expects a file name or file object
tree = ET.parse(source)
root = tree.getroot()
words = []
for word in root.findall('.//w'):
    # Element.text is None for an empty element (e.g. <w/>); fall back to an
    # empty token instead of crashing, so there is still one entry per <w>.
    words.append((word.text or "").strip())
# Bare expression: shows the token list when run interactively.
words
def _clean_amps(inlist):
htmlamp = ['&', 'amp', ';']
outlist = []
i = 0
while i < len(inlist):
if inlist[i:i+3] == htmlamp:
outlist.append('&')
i += 3
continue
else:
outlist.append(inlist[i])
i += 1
return outlist
# Bare expression: evaluates the token list with '&', 'amp', ';' runs merged.
# The result is not stored; it is only visible when run interactively.
_clean_amps(words)
def _get_or_blank(text):
if text == "|":
return ""
if text[0:1] == "|" and text[-1:] == "|":
text = text[1:-1]
return text
class Word():
def __init__(self, text, pos, msd, lemma, lex, sense, prefix, suffix, compwf, complemgram, ref, dephead, deprel):
self.text = text
self.pos = _get_or_blank(pos)
self.msd = _get_or_blank(msd)
self.lex = _get_or_blank(lex)