Irish chunks from UD
Chunks from Universal Dependencies
!python -m pip install conllu
import conllu
# One sentence (sent_id 909) in CoNLL-U form, used as the working example for
# the rest of this file. Each token line carries ten fields; the code below
# reads the id, form, upos and deprel fields through the parsed tokens.
# NOTE(review): the fields below appear separated by single spaces here,
# whereas CoNLL-U is specified as tab-separated — confirm the separators in
# the original file before relying on parse() accepting this text.
sample = """
# sent_id = 909
# text = M'anam go raibh sin iontach scáfar, ach ansin d'fhág tú sin agus chuaigh tú le sagartóireacht.
1 M' mo DET Det Number=Sing|Person=1|Poss=Yes 2 nmod:poss _ SpaceAfter=No
2 anam anam NOUN Noun Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing 0 root _ _
3 go go PART Vb PartType=Cmpl 4 mark:prt _ _
4 raibh bí VERB PastInd Mood=Ind|Tense=Past 2 csubj:cop _ _
5 sin sin PRON Dem PronType=Dem 4 nsubj _ _
6 iontach iontach ADV Its _ 7 advmod _ _
7 scáfar scáfar ADJ Adj Degree=Pos 4 xcomp:pred _ SpaceAfter=No
8 , , PUNCT Punct _ 9 punct _ _
9 ach ach SCONJ Subord _ 12 mark _ _
10 ansin ansin ADV Loc _ 12 advmod _ _
11 d' do PART Vb PartType=Vb 12 mark:prt _ SpaceAfter=No
12 fhág fág VERB VTI Form=Len|Mood=Ind|Tense=Past 4 advcl _ _
13 tú tú PRON Pers Number=Sing|Person=2 12 nsubj _ _
14 sin sin PRON Dem PronType=Dem 12 obj _ _
15 agus agus CCONJ Coord _ 16 cc _ _
16 chuaigh téigh VERB VTI Form=Len|Mood=Ind|Tense=Past 12 conj _ _
17 tú tú PRON Pers Number=Sing|Person=2 16 nsubj _ _
18 le le ADP Simp _ 19 case _ _
19 sagartóireacht sagartóireacht NOUN Noun Case=NomAcc|Gender=Fem|Number=Sing 16 obl _ SpaceAfter=No
20 . . PUNCT . _ 2 punct _ _
"""
from conllu.models import TokenList, Token
from conllu import parse
# Parse the sample text and show the raw parse result.
sent = parse(sample)
print(sent)

# Everything below works on the first parsed sentence.
sentence = sent[0]
print(sentence)
sentence[0]  # bare expression: in a notebook cell this displays the first token

# Index the sentence's tokens by their 'id' field for direct lookup.
tokens = dict((tok['id'], tok) for tok in sentence)
# Coarse lookup from a UPOS tag to a chunk type. Built from grouped pairs so
# that tags sharing a chunk type sit together; key order matches the original
# literal. It is not referenced by the other code visible in this file.
_CHUNKMAP = dict(
    [(upos, 'NP') for upos in ('PROPN', 'PRON', 'NUM', 'NOUN')]
    + [('ADJ', 'ADJP')]
    + [(upos, 'CONJ') for upos in ('SCONJ', 'CCONJ')]
)
# Lower-cased forms of SYM tokens that the chunk-labelling loop treats as
# nominal (NP) material; that loop compares token['form'].lower() against
# this list.
numerical_symbol = [
    # currency, percent, degree and numero signs
    "$", "%", "£", "°", "€", "n°",
    # power and mass units
    "kw", "kg", "g",
    # length units
    "km", "cm", "mm", "m",
    # area units
    "m²", "cm²", "mm²", "km²",
]
# Marker returned by _chunk_type() for UPOS tags that no rule covers; such
# tokens are left without a 'chunk' key at all, which the reporting loop
# further down relies on when it tests `'chunk' in token`.
_NO_RULE = object()


def _chunk_type(token):
    """Return the chunk type that the labelling rules assign to one token.

    Args:
        token: a mapping with 'upos' and 'deprel' entries (and 'form' for
            SYM tokens), such as a token of the parsed sentence.

    Returns:
        'NP', 'PP', 'VP' or 'ADVP' when the token opens a chunk of that
        type; None when a rule covers the token but it belongs to no chunk;
        _NO_RULE when the token's UPOS tag is not covered by any rule.
    """
    upos = token['upos']

    # These tags are nominal whatever their dependency relation is.
    if upos in ('PROPN', 'PRON', 'NUM'):
        return 'NP'

    if upos not in ('NOUN', 'SYM', 'X', 'AUX', 'VERB',
                    'ADP', 'PART', 'ADV', 'DET'):
        return _NO_RULE

    deprel = token['deprel']

    # Adpositions are matched against exact relation names, unlike the
    # substring test applied to the remaining tags.
    if upos == 'ADP':
        return 'PP' if deprel in ('case', 'mark', 'compound:prt') else None

    # Determiners only form a chunk when their relation contains 'advmod'.
    if upos == 'DET':
        return 'ADVP' if 'advmod' in deprel else None

    # For every remaining tag, a relation containing 'case' means PP.
    if 'case' in deprel:
        return 'PP'

    if upos == 'NOUN':
        return 'NP'
    if upos in ('AUX', 'VERB'):
        return 'VP'
    if upos == 'ADV':
        return 'ADVP'
    if upos == 'SYM':
        form = token['form']
        # Listed unit/currency symbols, or any form containing '$', are NP.
        if form.lower() in numerical_symbol or '$' in form:
            return 'NP'

    # X, PART and any other SYM token belong to no chunk.
    return None


# Label every token of the sentence. Each chunk label is '<TYPE>-<n>', where
# n is a counter running over the whole sentence.
num_chunks = 0
for token in sentence:
    chunk_type = _chunk_type(token)
    if chunk_type is _NO_RULE:
        continue
    if chunk_type is None:
        # No number is consumed for an unchunked token, so chunk ids stay
        # consecutive and num_chunks ends up equal to the number of labels
        # actually assigned. (Previously a DET token without 'advmod'
        # bumped the counter while receiving no label.)
        token['chunk'] = None
    else:
        num_chunks += 1
        token['chunk'] = f'{chunk_type}-{num_chunks}'
num_chunks  # bare expression: in a notebook cell this displays the final counter value

# Print each token that carries a 'chunk' entry, next to that entry's value
# (which may be None); tokens without the key are skipped.
for token in sentence:
    if 'chunk' not in token:
        continue
    print(token, token['chunk'])