!python -m pip install conllu
Collecting conllu
  Using cached conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3
import conllu
# Test input: a single annotated sentence in CoNLL-U format.  The two leading
# `# key = value` lines are sentence metadata (sent_id, text); every following
# line describes one token as 10 tab-separated columns
# (id, form, lemma, upos, xpos, feats, head, deprel, deps, misc).
sample = """
# sent_id = 909
# text = M'anam go raibh sin iontach scáfar, ach ansin d'fhág tú sin agus chuaigh tú le sagartóireacht.
1	M'	mo	DET	Det	Number=Sing|Person=1|Poss=Yes	2	nmod:poss	_	SpaceAfter=No
2	anam	anam	NOUN	Noun	Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing	0	root	_	_
3	go	go	PART	Vb	PartType=Cmpl	4	mark:prt	_	_
4	raibh	bí	VERB	PastInd	Mood=Ind|Tense=Past	2	csubj:cop	_	_
5	sin	sin	PRON	Dem	PronType=Dem	4	nsubj	_	_
6	iontach	iontach	ADV	Its	_	7	advmod	_	_
7	scáfar	scáfar	ADJ	Adj	Degree=Pos	4	xcomp:pred	_	SpaceAfter=No
8	,	,	PUNCT	Punct	_	9	punct	_	_
9	ach	ach	SCONJ	Subord	_	12	mark	_	_
10	ansin	ansin	ADV	Loc	_	12	advmod	_	_
11	d'	do	PART	Vb	PartType=Vb	12	mark:prt	_	SpaceAfter=No
12	fhág	fág	VERB	VTI	Form=Len|Mood=Ind|Tense=Past	4	advcl	_	_
13	tú	tú	PRON	Pers	Number=Sing|Person=2	12	nsubj	_	_
14	sin	sin	PRON	Dem	PronType=Dem	12	obj	_	_
15	agus	agus	CCONJ	Coord	_	16	cc	_	_
16	chuaigh	téigh	VERB	VTI	Form=Len|Mood=Ind|Tense=Past	12	conj	_	_
17	tú	tú	PRON	Pers	Number=Sing|Person=2	16	nsubj	_	_
18	le	le	ADP	Simp	_	19	case	_	_
19	sagartóireacht	sagartóireacht	NOUN	Noun	Case=NomAcc|Gender=Fem|Number=Sing	16	obl	_	SpaceAfter=No
20	.	.	PUNCT	.	_	2	punct	_	_
"""
from conllu.models import TokenList, Token
from conllu import parse
# parse() turns the CoNLL-U text into a list of TokenList objects; `sample`
# holds one sentence, so the list printed here has a single element.
sent = parse(sample)
print(sent)
[TokenList<M', anam, go, raibh, sin, iontach, scáfar, ,, ach, ansin, d', fhág, tú, sin, agus, chuaigh, tú, le, sagartóireacht, ., metadata={sent_id: "909", text: "M'anam go raibh sin iontach scáfar, ach ansin d'fhág tú sin agus chuaigh tú le sagartóireacht."}>]
# Pick the first parsed sentence; all later cells operate on this TokenList.
sentence = sent[0]
print(sentence)
TokenList<M', anam, go, raibh, sin, iontach, scáfar, ,, ach, ansin, d', fhág, tú, sin, agus, chuaigh, tú, le, sagartóireacht, ., metadata={sent_id: "909", text: "M'anam go raibh sin iontach scáfar, ach ansin d'fhág tú sin agus chuaigh tú le sagartóireacht."}>
# Display the first token: a dict-like record keyed by the CoNLL-U field names.
sentence[0]
{'id': 1,
 'form': "M'",
 'lemma': 'mo',
 'upos': 'DET',
 'xpos': 'Det',
 'feats': {'Number': 'Sing', 'Person': '1', 'Poss': 'Yes'},
 'head': 2,
 'deprel': 'nmod:poss',
 'deps': None,
 'misc': {'SpaceAfter': 'No'},
 'chunk': None}
# Index the tokens of the sentence by their CoNLL-U id for direct lookup.
tokens = {t['id']: t for t in sentence}
# UPOS tag -> chunk label.
# NOTE(review): the chunking loop visible in this file hard-codes its own rules
# and never reads this table (nor does it label ADJ / SCONJ / CCONJ tokens) —
# confirm whether the table is still needed or the loop should consult it.
_CHUNKMAP = {
    'PROPN': 'NP',
    'PRON': 'NP',
    'NUM': 'NP',
    'NOUN': 'NP',
    'ADJ': 'ADJP',
    'SCONJ': 'CONJ',
    'CCONJ': 'CONJ',
}
# Currency / unit symbols: a SYM token whose lower-cased form appears in this
# list is treated as part of a noun phrase by the chunking loop.
numerical_symbol = ["$", "%", "£", "°", "€", "n°", "kw", "kg", "g", "km", "cm", "mm", "m", "m²", "cm²", "mm²", "km²"]
num_chunks = 0
for token in sentence:
    if token['upos'] in ['PROPN', 'PRON', 'NUM']:
        num_chunks += 1
        token['chunk'] = f'NP-{num_chunks}'
    elif token['upos'] == 'NOUN':
        if 'case' in token['deprel']:
            num_chunks += 1
            token['chunk'] = f'PP-{num_chunks}'
        else:
            num_chunks += 1
            token['chunk'] = f'NP-{num_chunks}'
    elif token['upos'] == 'SYM':
        if 'case' in token['deprel']:
            num_chunks += 1
            token['chunk'] = f'PP-{num_chunks}'
        elif token['form'].lower() in numerical_symbol or '$' in token['form']:
            num_chunks += 1
            token['chunk'] = f'NP-{num_chunks}'
        else:
            token['chunk'] = None
    elif token['upos'] == 'X':
        if 'case' in token['deprel']:
            num_chunks += 1
            token['chunk'] = f'PP-{num_chunks}'
        else:
            token['chunk'] = None
    elif token['upos'] in ['AUX', 'VERB']:
        if 'case' in token['deprel']:
            num_chunks += 1
            token['chunk'] = f'PP-{num_chunks}'
        else:
            num_chunks += 1
            token['chunk'] = f'VP-{num_chunks}'
    elif token['upos'] == 'ADP':
        if token['deprel'] in ["case", "mark", "compound:prt"]:
            num_chunks += 1
            token['chunk'] = f'PP-{num_chunks}'
        else:
            token['chunk'] = None
    elif token['upos'] == 'PART':
        if 'case' in token['deprel']:
            num_chunks += 1
            token['chunk'] = f'PP-{num_chunks}'
        else:
            token['chunk'] = None
    elif token['upos'] == 'ADV':
        if 'case' in token['deprel']:
            num_chunks += 1
            token['chunk'] = f'PP-{num_chunks}'
        else:
            num_chunks += 1
            token['chunk'] = f'ADVP-{num_chunks}'
    elif token['upos'] == 'DET':
        if 'advmod' in token['deprel']:
            num_chunks += 1
            token['chunk'] = f'ADVP-{num_chunks}'
        else:
            num_chunks += 1
            token['chunk'] = None
num_chunks
13
for token in sentence:
    if 'chunk' in token:
        print(token, token['chunk'])
M' None
anam NP-2
go None
raibh VP-3
sin NP-4
iontach ADVP-5
ansin ADVP-6
d' None
fhág VP-7
tú NP-8
sin NP-9
chuaigh VP-10
tú NP-11
le PP-12
sagartóireacht NP-13