More playing with spaCy
Using Stanza, some steps towards Swedish, some Irish
• 33 min read
%pip install spacy_stanza
import stanza
import spacy_stanza
# Fetch the English Stanza models, then load them as a spaCy pipeline
# through the spacy_stanza wrapper.
stanza.download("en")
nlp = spacy_stanza.load_pipeline("en")

doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

# One output line per token: surface form, lemma, coarse POS tag,
# dependency label and entity type.
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.dep_,
        token.ent_type_,
    )

# Entity spans found in the whole document.
print(doc.ents)
# Same setup as for English, this time with the Swedish models.
stanza.download("sv")
nlp_sv = spacy_stanza.load_pipeline("sv")

# Phrases to parse. Adjacent entries are two variants of the same phrase,
# so that their analyses can be compared side by side.
SENTS = [
    "dem som går igenom en subventionerad anställning",
    "dem som har haft en subventionerad anställning",
    # H101UbU5_0028 (presumably the id of the source the next pairs
    # come from -- TODO confirm)
    "tidsanvändning så framkommer",
    "tidsanvändning framkommer",
    "arbetsuppgifter har tats bort",
    "arbetsuppgifter tagits bort",
    "administrativa uppgifterna har eller har ökat",
    "administrativa uppgifterna ökat",
    # more fun:
    "Och det tror jag inte det råder någon oenighet om här.",
    "Och jag tror inte att det här råder någon oenighet om det.",
]

# Parse every phrase, keeping the JSON form of each parsed doc in `res`
# and echoing the doc itself as it is processed.
res = []
for sent in SENTS:
    doc = nlp_sv(sent)
    res.append(doc.to_json())
    print(doc)
dem som går igenom en subventionerad anställning dem som har haft en subventionerad anställning tidsanvändning så framkommer tidsanvändning framkommer arbetsuppgifter har tats bort arbetsuppgifter tagits bort administrativa uppgifterna har eller har ökat administrativa uppgifterna ökat Och det tror jag inte det råder någon oenighet om här. Och jag tror inte att det här råder någon oenighet om det.
res
[{'text': 'dem som går igenom en subventionerad anställning', 'ents': [], 'sents': [{'start': 0, 'end': 48}], 'tokens': [{'id': 0, 'start': 0, 'end': 3, 'tag': 'PN|UTR/NEU|PLU|DEF|OBJ', 'pos': 'PRON', 'morph': 'Case=Acc|Definite=Def|Number=Plur|PronType=Prs', 'lemma': 'de', 'dep': 'root', 'head': 0}, {'id': 1, 'start': 4, 'end': 7, 'tag': 'HP|-|-|-', 'pos': 'PRON', 'morph': 'PronType=Rel', 'lemma': 'som', 'dep': 'nsubj', 'head': 2}, {'id': 2, 'start': 8, 'end': 11, 'tag': 'VB|PRS|AKT', 'pos': 'VERB', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'gå', 'dep': 'acl:relcl', 'head': 0}, {'id': 3, 'start': 12, 'end': 18, 'tag': 'PL', 'pos': 'ADP', 'morph': '', 'lemma': 'igenom', 'dep': 'compound:prt', 'head': 2}, {'id': 4, 'start': 19, 'end': 21, 'tag': 'DT|UTR|SIN|IND', 'pos': 'DET', 'morph': 'Definite=Ind|Gender=Com|Number=Sing|PronType=Art', 'lemma': 'en', 'dep': 'det', 'head': 6}, {'id': 5, 'start': 22, 'end': 36, 'tag': 'PC|PRF|UTR|SIN|IND|NOM', 'pos': 'ADJ', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Sing|Tense=Past|VerbForm=Part', 'lemma': 'subventionerad', 'dep': 'amod', 'head': 6}, {'id': 6, 'start': 37, 'end': 48, 'tag': 'NN|UTR|SIN|IND|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Sing', 'lemma': 'anställning', 'dep': 'obj', 'head': 2}]}, {'text': 'dem som har haft en subventionerad anställning', 'ents': [], 'sents': [{'start': 0, 'end': 46}], 'tokens': [{'id': 0, 'start': 0, 'end': 3, 'tag': 'PN|UTR/NEU|PLU|DEF|OBJ', 'pos': 'PRON', 'morph': 'Case=Acc|Definite=Def|Number=Plur|PronType=Prs', 'lemma': 'de', 'dep': 'root', 'head': 0}, {'id': 1, 'start': 4, 'end': 7, 'tag': 'HP|-|-|-', 'pos': 'PRON', 'morph': 'PronType=Rel', 'lemma': 'som', 'dep': 'nsubj', 'head': 3}, {'id': 2, 'start': 8, 'end': 11, 'tag': 'VB|PRS|AKT', 'pos': 'AUX', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'ha', 'dep': 'aux', 'head': 3}, {'id': 3, 'start': 12, 'end': 16, 'tag': 'VB|SUP|AKT', 'pos': 'VERB', 'morph': 
'VerbForm=Sup|Voice=Act', 'lemma': 'ha', 'dep': 'acl:relcl', 'head': 0}, {'id': 4, 'start': 17, 'end': 19, 'tag': 'DT|UTR|SIN|IND', 'pos': 'DET', 'morph': 'Definite=Ind|Gender=Com|Number=Sing|PronType=Art', 'lemma': 'en', 'dep': 'det', 'head': 6}, {'id': 5, 'start': 20, 'end': 34, 'tag': 'PC|PRF|UTR|SIN|IND|NOM', 'pos': 'ADJ', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Sing|Tense=Past|VerbForm=Part', 'lemma': 'subventionerad', 'dep': 'amod', 'head': 6}, {'id': 6, 'start': 35, 'end': 46, 'tag': 'NN|UTR|SIN|IND|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Sing', 'lemma': 'anställning', 'dep': 'obj', 'head': 3}]}, {'text': 'tidsanvändning så framkommer', 'ents': [], 'sents': [{'start': 0, 'end': 28}], 'tokens': [{'id': 0, 'start': 0, 'end': 14, 'tag': 'NN|UTR|SIN|IND|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Sing', 'lemma': 'tidsanvändning', 'dep': 'nsubj', 'head': 2}, {'id': 1, 'start': 15, 'end': 17, 'tag': 'AB', 'pos': 'ADV', 'morph': '', 'lemma': 'så', 'dep': 'advmod', 'head': 2}, {'id': 2, 'start': 18, 'end': 28, 'tag': 'VB|PRS|AKT', 'pos': 'VERB', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'framkomma', 'dep': 'root', 'head': 2}]}, {'text': 'tidsanvändning framkommer', 'ents': [], 'sents': [{'start': 0, 'end': 25}], 'tokens': [{'id': 0, 'start': 0, 'end': 14, 'tag': 'NN|UTR|SIN|IND|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Sing', 'lemma': 'tidsanvändning', 'dep': 'nsubj', 'head': 1}, {'id': 1, 'start': 15, 'end': 25, 'tag': 'VB|PRS|AKT', 'pos': 'VERB', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'framkomma', 'dep': 'root', 'head': 1}]}, {'text': 'arbetsuppgifter har tats bort', 'ents': [], 'sents': [{'start': 0, 'end': 29}], 'tokens': [{'id': 0, 'start': 0, 'end': 15, 'tag': 'NN|UTR|PLU|IND|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Plur', 'lemma': 'arbetsuppgift', 'dep': 'nsubj:pass', 'head': 2}, {'id': 
1, 'start': 16, 'end': 19, 'tag': 'VB|PRS|AKT', 'pos': 'AUX', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'ha', 'dep': 'aux', 'head': 2}, {'id': 2, 'start': 20, 'end': 24, 'tag': 'VB|SUP|SFO', 'pos': 'VERB', 'morph': 'VerbForm=Sup|Voice=Pass', 'lemma': 'ta', 'dep': 'root', 'head': 2}, {'id': 3, 'start': 25, 'end': 29, 'tag': 'PL', 'pos': 'ADV', 'morph': '', 'lemma': 'bort', 'dep': 'compound:prt', 'head': 2}]}, {'text': 'arbetsuppgifter tagits bort', 'ents': [], 'sents': [{'start': 0, 'end': 27}], 'tokens': [{'id': 0, 'start': 0, 'end': 15, 'tag': 'NN|UTR|PLU|IND|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Plur', 'lemma': 'arbetsuppgift', 'dep': 'nsubj:pass', 'head': 1}, {'id': 1, 'start': 16, 'end': 22, 'tag': 'VB|SUP|SFO', 'pos': 'VERB', 'morph': 'VerbForm=Sup|Voice=Pass', 'lemma': 'ta', 'dep': 'root', 'head': 1}, {'id': 2, 'start': 23, 'end': 27, 'tag': 'PL', 'pos': 'ADV', 'morph': '', 'lemma': 'bort', 'dep': 'compound:prt', 'head': 1}]}, {'text': 'administrativa uppgifterna har eller har ökat', 'ents': [], 'sents': [{'start': 0, 'end': 45}], 'tokens': [{'id': 0, 'start': 0, 'end': 14, 'tag': 'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM', 'pos': 'ADJ', 'morph': 'Case=Nom|Degree=Pos|Number=Plur', 'lemma': 'administrativ', 'dep': 'amod', 'head': 1}, {'id': 1, 'start': 15, 'end': 26, 'tag': 'NN|UTR|PLU|DEF|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Def|Gender=Com|Number=Plur', 'lemma': 'uppgift', 'dep': 'nsubj', 'head': 2}, {'id': 2, 'start': 27, 'end': 30, 'tag': 'VB|PRS|AKT', 'pos': 'VERB', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'ha', 'dep': 'root', 'head': 2}, {'id': 3, 'start': 31, 'end': 36, 'tag': 'KN', 'pos': 'CCONJ', 'morph': '', 'lemma': 'eller', 'dep': 'cc', 'head': 5}, {'id': 4, 'start': 37, 'end': 40, 'tag': 'VB|PRS|AKT', 'pos': 'AUX', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'ha', 'dep': 'aux', 'head': 5}, {'id': 5, 'start': 41, 'end': 45, 'tag': 'VB|SUP|AKT', 'pos': 
'VERB', 'morph': 'VerbForm=Sup|Voice=Act', 'lemma': 'öka', 'dep': 'conj', 'head': 2}]}, {'text': 'administrativa uppgifterna ökat', 'ents': [], 'sents': [{'start': 0, 'end': 31}], 'tokens': [{'id': 0, 'start': 0, 'end': 14, 'tag': 'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM', 'pos': 'ADJ', 'morph': 'Case=Nom|Degree=Pos|Number=Plur', 'lemma': 'administrativ', 'dep': 'amod', 'head': 1}, {'id': 1, 'start': 15, 'end': 26, 'tag': 'NN|UTR|PLU|DEF|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Def|Gender=Com|Number=Plur', 'lemma': 'uppgift', 'dep': 'nsubj', 'head': 2}, {'id': 2, 'start': 27, 'end': 31, 'tag': 'VB|SUP|AKT', 'pos': 'VERB', 'morph': 'VerbForm=Sup|Voice=Act', 'lemma': 'öka', 'dep': 'root', 'head': 2}]}, {'text': 'Och det tror jag inte det råder någon oenighet om här.', 'ents': [], 'sents': [{'start': 0, 'end': 54}], 'tokens': [{'id': 0, 'start': 0, 'end': 3, 'tag': 'KN', 'pos': 'CCONJ', 'morph': '', 'lemma': 'och', 'dep': 'cc', 'head': 2}, {'id': 1, 'start': 4, 'end': 7, 'tag': 'PN|NEU|SIN|DEF|SUB/OBJ', 'pos': 'PRON', 'morph': 'Definite=Def|Gender=Neut|Number=Sing|PronType=Prs', 'lemma': 'den', 'dep': 'obj', 'head': 2}, {'id': 2, 'start': 8, 'end': 12, 'tag': 'VB|PRS|AKT', 'pos': 'VERB', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'tro', 'dep': 'root', 'head': 2}, {'id': 3, 'start': 13, 'end': 16, 'tag': 'PN|UTR|SIN|DEF|SUB', 'pos': 'PRON', 'morph': 'Case=Nom|Definite=Def|Gender=Com|Number=Sing|PronType=Prs', 'lemma': 'jag', 'dep': 'nsubj', 'head': 2}, {'id': 4, 'start': 17, 'end': 21, 'tag': 'AB', 'pos': 'PART', 'morph': 'Polarity=Neg', 'lemma': 'inte', 'dep': 'advmod', 'head': 2}, {'id': 5, 'start': 22, 'end': 25, 'tag': 'PN|NEU|SIN|DEF|SUB/OBJ', 'pos': 'PRON', 'morph': 'Definite=Def|Gender=Neut|Number=Sing|PronType=Prs', 'lemma': 'den', 'dep': 'expl', 'head': 6}, {'id': 6, 'start': 26, 'end': 31, 'tag': 'VB|PRS|AKT', 'pos': 'VERB', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'råda', 'dep': 'ccomp', 'head': 2}, {'id': 7, 
'start': 32, 'end': 37, 'tag': 'DT|UTR|SIN|IND', 'pos': 'DET', 'morph': 'Definite=Ind|Gender=Com|Number=Sing|PronType=Ind', 'lemma': 'någon', 'dep': 'det', 'head': 8}, {'id': 8, 'start': 38, 'end': 46, 'tag': 'NN|UTR|SIN|IND|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Sing', 'lemma': 'oenighet', 'dep': 'nsubj', 'head': 6}, {'id': 9, 'start': 47, 'end': 49, 'tag': 'PP', 'pos': 'ADP', 'morph': '', 'lemma': 'om', 'dep': 'advmod', 'head': 6}, {'id': 10, 'start': 50, 'end': 53, 'tag': 'AB', 'pos': 'ADV', 'morph': '', 'lemma': 'här', 'dep': 'fixed', 'head': 9}, {'id': 11, 'start': 53, 'end': 54, 'tag': 'MAD', 'pos': 'PUNCT', 'morph': '', 'lemma': '.', 'dep': 'punct', 'head': 2}]}, {'text': 'Och jag tror inte att det här råder någon oenighet om det.', 'ents': [], 'sents': [{'start': 0, 'end': 58}], 'tokens': [{'id': 0, 'start': 0, 'end': 3, 'tag': 'KN', 'pos': 'CCONJ', 'morph': '', 'lemma': 'och', 'dep': 'cc', 'head': 2}, {'id': 1, 'start': 4, 'end': 7, 'tag': 'PN|UTR|SIN|DEF|SUB', 'pos': 'PRON', 'morph': 'Case=Nom|Definite=Def|Gender=Com|Number=Sing|PronType=Prs', 'lemma': 'jag', 'dep': 'nsubj', 'head': 2}, {'id': 2, 'start': 8, 'end': 12, 'tag': 'VB|PRS|AKT', 'pos': 'VERB', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 'lemma': 'tro', 'dep': 'root', 'head': 2}, {'id': 3, 'start': 13, 'end': 17, 'tag': 'AB', 'pos': 'PART', 'morph': 'Polarity=Neg', 'lemma': 'inte', 'dep': 'advmod', 'head': 2}, {'id': 4, 'start': 18, 'end': 21, 'tag': 'SN', 'pos': 'SCONJ', 'morph': '', 'lemma': 'att', 'dep': 'mark', 'head': 7}, {'id': 5, 'start': 22, 'end': 25, 'tag': 'PN|NEU|SIN|DEF|SUB/OBJ', 'pos': 'PRON', 'morph': 'Definite=Def|Gender=Neut|Number=Sing|PronType=Prs', 'lemma': 'den', 'dep': 'expl', 'head': 7}, {'id': 6, 'start': 26, 'end': 29, 'tag': 'AB', 'pos': 'ADV', 'morph': '', 'lemma': 'här', 'dep': 'fixed', 'head': 5}, {'id': 7, 'start': 30, 'end': 35, 'tag': 'VB|PRS|AKT', 'pos': 'VERB', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act', 
'lemma': 'råda', 'dep': 'ccomp', 'head': 2}, {'id': 8, 'start': 36, 'end': 41, 'tag': 'DT|UTR|SIN|IND', 'pos': 'DET', 'morph': 'Definite=Ind|Gender=Com|Number=Sing|PronType=Ind', 'lemma': 'någon', 'dep': 'det', 'head': 9}, {'id': 9, 'start': 42, 'end': 50, 'tag': 'NN|UTR|SIN|IND|NOM', 'pos': 'NOUN', 'morph': 'Case=Nom|Definite=Ind|Gender=Com|Number=Sing', 'lemma': 'oenighet', 'dep': 'nsubj', 'head': 7}, {'id': 10, 'start': 51, 'end': 53, 'tag': 'PP', 'pos': 'ADP', 'morph': '', 'lemma': 'om', 'dep': 'case', 'head': 11}, {'id': 11, 'start': 54, 'end': 57, 'tag': 'PN|NEU|SIN|DEF|SUB/OBJ', 'pos': 'PRON', 'morph': 'Definite=Def|Gender=Neut|Number=Sing|PronType=Prs', 'lemma': 'den', 'dep': 'obl', 'head': 7}, {'id': 12, 'start': 57, 'end': 58, 'tag': 'MAD', 'pos': 'PUNCT', 'morph': '', 'lemma': '.', 'dep': 'punct', 'head': 2}]}]
import stanza
from stanza.utils.conll import CoNLL
# Plain Stanza pipeline for Swedish (no spaCy wrapper this time).
# NOTE: this rebinds `nlp`, which earlier in the notebook held the English
# spacy_stanza pipeline.
nlp = stanza.Pipeline(
    lang='sv',
    processors='tokenize, pos, lemma, depparse',
)

doc = nlp('Du har mitt ord.')  # doc is class Document

# Write the parsed document to "doc2.conllu".
CoNLL.write_doc2conll(doc, "doc2.conllu")
INFO:stanza:Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES INFO:stanza:Loading these models for language: sv (Swedish): ================================== | Processor | Package | ---------------------------------- | tokenize | talbanken | | pos | talbanken_charlm | | lemma | talbanken_nocharlm | | depparse | talbanken_charlm | ================================== INFO:stanza:Using device: cpu INFO:stanza:Loading: tokenize INFO:stanza:Loading: pos INFO:stanza:Loading: lemma /usr/local/lib/python3.10/dist-packages/stanza/models/lemma/trainer.py:227: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Loading: depparse /usr/local/lib/python3.10/dist-packages/stanza/models/depparse/trainer.py:103: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Done loading processors!
!cat doc2.conllu
# text = Du har mitt ord. # sent_id = 0 1 Du du PRON PN|UTR|SIN|DEF|SUB Case=Nom|Definite=Def|Gender=Com|Number=Sing|PronType=Prs 2 nsubj _ start_char=0|end_char=2 2 har ha VERB VB|PRS|AKT Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ start_char=3|end_char=6 3 mitt jag PRON PS|NEU|SIN|DEF Definite=Def|Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs 4 nmod:poss _ start_char=7|end_char=11 4 ord ord NOUN NN|NEU|SIN|IND|NOM Case=Nom|Definite=Ind|Gender=Neut|Number=Sing 2 obj _ start_char=12|end_char=15 5 . . PUNCT MAD _ 2 punct _ start_char=15|end_char=16
import stanza
from stanza.utils.conll import CoNLL
# Irish pipeline: tokenisation, POS tagging, lemmatisation and
# dependency parsing.
nlp_ga = stanza.Pipeline(
    lang='ga',
    processors='tokenize, pos, lemma, depparse',
)

doc_ga = nlp_ga("áthas a bheith air")

# Write the parsed phrase to "docga.conllu".
CoNLL.write_doc2conll(doc_ga, "docga.conllu")
INFO:stanza:Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES INFO:stanza:Loading these models for language: ga (Irish): ============================ | Processor | Package | ---------------------------- | tokenize | idt | | pos | idt_nocharlm | | lemma | idt_nocharlm | | depparse | idt_nocharlm | ============================ INFO:stanza:Using device: cpu INFO:stanza:Loading: tokenize INFO:stanza:Loading: pos INFO:stanza:Loading: lemma INFO:stanza:Loading: depparse INFO:stanza:Done loading processors!
!cat docga.conllu
# text = áthas a bheith air # sent_id = 0 1 áthas áthas NOUN Noun Case=Nom|Gender=Masc|Number=Sing 3 obj _ start_char=0|end_char=5 2 a a PART Inf PartType=Inf 3 mark _ start_char=6|end_char=7 3 bheith bheith NOUN Noun Form=Len|VerbForm=Inf 0 root _ start_char=8|end_char=14 4 air ar ADP Prep Gender=Masc|Number=Sing|Person=3 3 obl:prep _ start_char=15|end_char=18
# Irish sample passage (item 26). Same text as SAMPLE_ORIG, but with the
# overdotted consonants written as consonant + "h" (e.g. "Ġ" -> "GH") and a
# few spellings adjusted (e.g. "dorus" -> "doras").
# NOTE(review): the inner quotation before "an té" opens with ”’ here, while
# SAMPLE_ORIG and SAMPLE_CAI open it with “‘ -- possibly a typo; confirm.
SAMPLE = """26.—AN GHRÁINNEÓG AGUS NA HAITHREACHA NIMHE.
Tháinig an ghráinneóg go doras pluaise na n-aithreach nimhe oíche sheaca, agus d’ iar sí bheith istigh orthu. Do ligeadar isteach í ó bhí an oíche chomh fuar. Shocruig sí í fein ar lár an urláir agus dhein sí liarthóid di fein, agus shín sí na deilgne amach mórthimpeall uirthi féin, i dtreo ná féadfadh aon duine dul na goire. Ní fhéadadh athair nimhe gabháil tháirse, síos ná suas, ná prioctí é.
“Féach!” ar siad, “tá an phluais seo beag ár ndóithin againne féin. Ní foláir duitse bheith ag imeacht.”
“Ambasa,” ar sise, ”’an té leis gur cúmhang fágadh!’ Níl aon locht agamsa ar an áit seo.”
An Múineadh.
“Ní haitheantas go haontígheas.”
Cheap na haithreacha nimhe ná bheadh aon bhac orthu an ghráinneóg do chur amach pé uair ba mhaith leo é, nó fiacal nimhe do chur inti. Níor chuímhnígheadar riamh ar na deilgníbh."""
# The same passage (item 26) spelled with overdotted consonants
# (e.g. "Ġ", "Ṫ", "Ċ") and hyphenated prefixes such as "H-AIṪREAĊA".
# Presumably the spelling of the original source text, going by the
# name -- TODO confirm.
SAMPLE_ORIG = """26.—AN ĠRÁINNEÓG AGUS NA H-AIṪREAĊA NÍṀE.
Ṫáinig an ġráinneóg go dorus pluaise na n-aiṫreaċ níṁe oíḋċe ṡeaca, agus d’ iar sí ḃeiṫ istiġ orṫa. Do leigeadar isteaċ í ó ḃí an oíḋċe ċóṁ fuar. Ṡocruig sí í fein ar lár an úrláir agus ḋein sí liarṫóid dí fein, agus ṡín sí na deilgne amaċ mór-ṫímpal uirṫi féin, i dtreó ná féadfaḋ aoinne dul na goire. Ní ḟéadaḋ aṫair níṁe gaḃáil ṫáirse, síos ná suas, ná prioctí é.
“Feuċ!” ar siad, “tá an ṗluais seo beag ár ndóiṫin againne féin. Ní foláir duitse ḃeiṫ ag imṫeaċt.”
“Ambasa,” ar sise, “‘an t-é leis gur cúṁang fágaḋ!’ Ní’l aon loċt agamsa ar an áit seo.”
An Múineaḋ.
“Ní h-aiṫeantas go h-aontíġeas.”
Ċeap na h-aiṫreaċa níṁe ná ḃéaḋ aon ḃac orṫa an ġráinneóg do ċur amaċ pé uair ba ṁaiṫ leo é, nó fiacal níṁe do ċur inti. Níor ċuíṁníġeadar riaṁ ar na deilgníḃ."""
# A third rendering of the same passage (item 26) that changes wording as
# well as spelling relative to SAMPLE (e.g. "dhein" -> "rinne",
# "haithreacha" -> "nathracha", "Do ligeadar" -> "Ligeadar").
# Presumably "CAI" marks a standardised-spelling version -- TODO confirm.
SAMPLE_CAI = """26. — AN GHRÁINNEOG AGUS NA NATHRACHA NIMHE.
Tháinig an ghráinneog go doras pluaise na nathrach nimhe oíche sheaca, agus d'iarr sí bheith istigh orthu. Ligeadar isteach í ó bhí an oíche chomh fuar. Shocraigh sí í féin ar lár an urláir agus rinne sí liarthóid di féin, agus shín sí na deilgne amach mórthimpeall uirthi féin, i dtreo nach bhféadfadh aon duine dul ina gaire. Ní fhéadadh nathair nimhe gabháil thairsti, síos ná suas, ná prioctí é.
“Féach!” ar siad, “tá an phluais seo beag ár ndóthain againne féin. Ní foláir duitse bheith ag imeacht.”
“Ambaiste,” ar sise, “‘an té leis gur cúng fágadh!’ Níl aon locht agamsa ar an áit seo.”
An Múineadh.
“Ní haitheantas go haontíos.”
Cheap na nathracha nimhe ná beadh aon bhac orthu an ghráinneog a chur amach pé uair ba mhaith leo é, nó fiacla nimhe a chur inti. Níor chuimhníodar riamh ar na deilgní."""
!pip install stanza
import stanza
from stanza.utils.conll import CoNLL
# Polish pipeline. The default Polish package expects the multi-word-token
# (mwt) processor: without it in the list Stanza logs "Language pl package
# default expects mwt, which has been added" and inserts it anyway. Listing
# it explicitly builds the same pipeline without relying on that fix-up.
nlp_pl = stanza.Pipeline('pl', processors='tokenize, mwt, pos, lemma, depparse')
doc_pl = nlp_pl("Wolne Miasto Gdańsk (niem. Freie Stadt Danzig) – istniejące w okresie międzywojennym autonomiczne miasto-państwo, pod ochroną Ligi Narodów.")
CoNLL.write_doc2conll(doc_pl, "docpl.conllu")
INFO:stanza:Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES INFO:stanza:Downloaded file to /root/stanza_resources/resources.json WARNING:stanza:Language pl package default expects mwt, which has been added INFO:stanza:Loading these models for language: pl (Polish): ============================ | Processor | Package | ---------------------------- | tokenize | pdb | | mwt | pdb | | pos | pdb_charlm | | lemma | pdb_nocharlm | | depparse | pdb_charlm | ============================ INFO:stanza:Using device: cpu INFO:stanza:Loading: tokenize /usr/local/lib/python3.10/dist-packages/stanza/models/tokenization/trainer.py:82: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Loading: mwt /usr/local/lib/python3.10/dist-packages/stanza/models/mwt/trainer.py:201: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Loading: pos /usr/local/lib/python3.10/dist-packages/stanza/models/pos/trainer.py:139: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
checkpoint = torch.load(filename, lambda storage, loc: storage) /usr/local/lib/python3.10/dist-packages/stanza/models/common/pretrain.py:56: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. data = torch.load(self.filename, lambda storage, loc: storage) /usr/local/lib/python3.10/dist-packages/stanza/models/common/char_model.py:271: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature. state = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Loading: lemma /usr/local/lib/python3.10/dist-packages/stanza/models/lemma/trainer.py:239: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Loading: depparse /usr/local/lib/python3.10/dist-packages/stanza/models/depparse/trainer.py:194: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Done loading processors!
!cat docpl.conllu
# text = Wolne Miasto Gdańsk (niem. Freie Stadt Danzig) – istniejące w okresie międzywojennym autonomiczne miasto-państwo, pod ochroną Ligi Narodów. # sent_id = 0 1 Wolne wolny ADJ adj:sg:nom:n:pos Case=Nom|Degree=Pos|Gender=Neut|Number=Sing 2 amod:flat _ start_char=0|end_char=5 2 Miasto miasto NOUN subst:sg:nom:n:ncol Case=Nom|Gender=Neut|Number=Sing 0 root _ start_char=6|end_char=12 3 Gdańsk Gdańsk PROPN subst:sg:nom:m3 Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 2 flat _ start_char=13|end_char=19|SpacesAfter=\u00A0 4 ( ( PUNCT interp PunctSide=Ini|PunctType=Brck 5 punct _ start_char=20|end_char=21|SpaceAfter=No 5 niem niem X brev:pun Abbr=Yes|Pun=Yes 2 nmod _ start_char=21|end_char=25|SpaceAfter=No 6 . . PUNCT interp PunctType=Peri 5 punct _ start_char=25|end_char=26|SpacesAfter=\u00A0 7 Freie Frea PROPN subst:sg:nom:n:ncol Case=Nom|Gender=Neut|Number=Sing 5 flat _ start_char=27|end_char=32 8 Stadt Stadt PROPN subst:sg:nom:n:ncol Case=Nom|Gender=Neut|Number=Sing 7 flat _ start_char=33|end_char=38 9 Danzig Danzig PROPN subst:sg:nom:n:ncol Case=Nom|Gender=Neut|Number=Sing 8 flat _ start_char=39|end_char=45|SpaceAfter=No 10 ) ) PUNCT interp PunctSide=Fin|PunctType=Brck 5 punct _ start_char=45|end_char=46 11 – – PUNCT interp PunctType=Dash 17 punct _ start_char=47|end_char=48 12 istniejące istnieć ADJ pact:sg:nom:n:imperf:aff Aspect=Imp|Case=Nom|Gender=Neut|Number=Sing|Polarity=Pos|VerbForm=Part|Voice=Act 17 acl _ start_char=49|end_char=59 13 w w ADP prep:loc:nwok AdpType=Prep|Variant=Short 14 case _ start_char=60|end_char=61|SpacesAfter=\u00A0 14 okresie okres NOUN subst:sg:loc:m3 Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing 12 obl _ start_char=62|end_char=69 15 międzywojennym międzywojenny ADJ adj:sg:loc:m3:pos Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing 14 amod _ start_char=70|end_char=84|SpacesAfter=\u00A0 16 autonomiczne autonomiczny ADJ adj:sg:nom:n:pos Case=Nom|Degree=Pos|Gender=Neut|Number=Sing 17 amod _ 
start_char=85|end_char=97|SpacesAfter=\u00A0 17 miasto miasto NOUN subst:sg:nom:n:ncol Case=Nom|Gender=Neut|Number=Sing 2 appos _ start_char=98|end_char=104|SpaceAfter=No 18 - - PUNCT interp PunctType=Dash 19 punct _ start_char=104|end_char=105|SpaceAfter=No 19 państwo państwo NOUN subst:sg:nom:n:ncol Case=Nom|Gender=Neut|Number=Sing 17 appos _ start_char=105|end_char=112|SpaceAfter=No 20 , , PUNCT interp PunctType=Comm 22 punct _ start_char=112|end_char=113 21 pod pod ADP prep:inst:nwok AdpType=Prep|Variant=Short 22 case _ start_char=114|end_char=117 22 ochroną ochrona NOUN subst:sg:inst:f Case=Ins|Gender=Fem|Number=Sing 17 nmod _ start_char=118|end_char=125|SpacesAfter=\u00A0 23 Ligi liga NOUN subst:sg:gen:f Case=Gen|Gender=Fem|Number=Sing 22 nmod:arg _ start_char=126|end_char=130 24 Narodów naród NOUN subst:pl:gen:m3 Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur 23 nmod:flat _ start_char=131|end_char=138|SpaceAfter=No 25 . . PUNCT interp PunctType=Peri 2 punct _ start_char=138|end_char=139|SpaceAfter=No
doc_pl = nlp_pl("Czy może to tak być? Albo biało, albo czarno. Czerwony jak krasnoludek. Lekki jak piórko. Jakim sposobem się tam znalazłeś? Czy byłeś w domu? Jest tata w domu? My jesteśmy Kaszubami. My jesteśmy Kaszubami. język kaszubski. Wiem, że ona je teraz obiad. Ona jest Polką. On by nic nie robił, tylko jadł i spał. Chciałem do miasta jechać z rana, alem zaspał. Mam tylko wodę w tej butelce. Nie śpij teraz! Śpiący kot myszy nie łapie. Moniki kot rozbił dzbanek.")
CoNLL.write_doc2conll(doc_pl, "docpl.conllu")
!cat docpl.conllu
# text = Czy może to tak być? # sent_id = 0 1 Czy czy PART part PartType=Int 2 advmod _ start_char=0|end_char=3|SpacesAfter=\u00A0 2 może móc VERB fin:sg:ter:imperf Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ start_char=4|end_char=8|SpacesAfter=\u00A0 3 to to PRON subst:sg:nom:n:ncol Case=Nom|Gender=Neut|Number=Sing|PronType=Dem 2 nsubj _ start_char=9|end_char=11|SpacesAfter=\u00A0 4 tak tak ADV adv:pos Degree=Pos|PronType=Dem 5 advmod _ start_char=12|end_char=15|SpacesAfter=\u00A0 5 być być VERB inf:imperf Aspect=Imp|VerbForm=Inf|Voice=Act 2 xcomp _ start_char=16|end_char=19|SpaceAfter=No 6 ? ? PUNCT interp PunctType=Qest 2 punct _ start_char=19|end_char=20 # text = Albo biało, albo czarno. # sent_id = 1 1 Albo albo CCONJ conj _ 2 cc _ start_char=21|end_char=25|SpacesAfter=\u00A0 2 biało biały ADJ adja Hyph=Yes 0 root _ start_char=26|end_char=31|SpaceAfter=No 3 , , PUNCT interp PunctType=Comm 5 punct _ start_char=31|end_char=32|SpacesAfter=\u00A0 4 albo albo CCONJ conj _ 5 cc _ start_char=33|end_char=37|SpacesAfter=\u00A0 5 czarno czarny ADJ adja Hyph=Yes 2 conj _ start_char=38|end_char=44|SpaceAfter=No 6 . . PUNCT interp PunctType=Peri 2 punct _ start_char=44|end_char=45 # text = Czerwony jak krasnoludek. # sent_id = 2 1 Czerwony czerwony ADJ adj:sg:nom:m2:pos Animacy=Nhum|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing 0 root _ start_char=46|end_char=54|SpacesAfter=\u00A0 2 jak jak SCONJ comp ConjType=Comp 3 mark _ start_char=55|end_char=58|SpacesAfter=\u00A0 3 krasnoludek krasnoludka NOUN subst:sg:nom:m2 Animacy=Nhum|Case=Nom|Gender=Masc|Number=Sing 1 obl:cmpr _ start_char=59|end_char=70|SpaceAfter=No 4 . . PUNCT interp PunctType=Peri 1 punct _ start_char=70|end_char=71 # text = Lekki jak piórko. 
# sent_id = 3 1 Lekki lekki ADJ adj:sg:nom:m3:pos Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing 0 root _ start_char=72|end_char=77|SpacesAfter=\u00A0 2 jak jak SCONJ comp ConjType=Comp 3 mark _ start_char=78|end_char=81|SpacesAfter=\u00A0 3 piórko piórko NOUN subst:sg:nom:n:ncol Case=Nom|Gender=Neut|Number=Sing 1 obl:cmpr _ start_char=82|end_char=88|SpaceAfter=No 4 . . PUNCT interp PunctType=Peri 1 punct _ start_char=88|end_char=89 # text = Jakim sposobem się tam znalazłeś? # sent_id = 4 1 Jakim jaki DET adj:sg:inst:m3:pos Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing|PronType=Int 2 det _ start_char=90|end_char=95|SpacesAfter=\u00A0 2 sposobem sposób NOUN subst:sg:inst:m3 Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing 5 obl _ start_char=96|end_char=104|SpacesAfter=\u00A0 3 się się PRON part PronType=Prs|Reflex=Yes 5 expl:pv _ start_char=105|end_char=108|SpacesAfter=\u00A0 4 tam tam ADV adv PronType=Dem 5 advmod _ start_char=109|end_char=112|SpacesAfter=\u00A0 5-6 znalazłeś _ _ _ _ _ _ _ start_char=113|end_char=122|SpaceAfter=No 5 znalazł znaleźć VERB praet:sg:m1:perf Animacy=Hum|Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ start_char=113|end_char=120 6 eś być AUX aglt:sg:sec:imperf:wok Aspect=Imp|Clitic=Yes|Number=Sing|Person=2|Variant=Long 5 aux:clitic _ start_char=120|end_char=122 7 ? ? PUNCT interp PunctType=Qest 5 punct _ start_char=122|end_char=123 # text = Czy byłeś w domu? 
# sent_id = 5 1 Czy czy PART part PartType=Int 2 advmod _ start_char=124|end_char=127|SpacesAfter=\u00A0 2-3 byłeś _ _ _ _ _ _ _ start_char=128|end_char=133|SpacesAfter=\u00A0 2 był być VERB praet:sg:m1:imperf Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ start_char=128|end_char=131 3 eś być AUX aglt:sg:sec:imperf:wok Aspect=Imp|Clitic=Yes|Number=Sing|Person=2|Variant=Long 2 aux:clitic _ start_char=131|end_char=133 4 w w ADP prep:loc:nwok AdpType=Prep|Variant=Short 5 case _ start_char=134|end_char=135|SpacesAfter=\u00A0 5 domu dom NOUN subst:sg:loc:m3 Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing 2 obl _ start_char=136|end_char=140|SpaceAfter=No 6 ? ? PUNCT interp PunctType=Qest 2 punct _ start_char=140|end_char=141 # text = Jest tata w domu? # sent_id = 6 1 Jest być VERB fin:sg:ter:imperf Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ start_char=142|end_char=146|SpacesAfter=\u00A0 2 tata tata NOUN subst:sg:nom:m1 Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing 1 nsubj _ start_char=147|end_char=151|SpacesAfter=\u00A0 3 w w ADP prep:loc:nwok AdpType=Prep|Variant=Short 4 case _ start_char=152|end_char=153|SpacesAfter=\u00A0 4 domu dom NOUN subst:sg:loc:m3 Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing 1 obl _ start_char=154|end_char=158|SpaceAfter=No 5 ? ? PUNCT interp PunctType=Qest 1 punct _ start_char=158|end_char=159 # text = My jesteśmy Kaszubami. # sent_id = 7 1 My my PRON ppron12:pl:nom:m1:pri Animacy=Hum|Case=Nom|Gender=Masc|Number=Plur|Person=1|PronType=Prs 3 nsubj _ start_char=160|end_char=162|SpacesAfter=\u00A0 2 jesteśmy być AUX fin:pl:pri:imperf Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act 3 cop _ start_char=163|end_char=171|SpacesAfter=\u00A0 3 Kaszubami Kaszuby PROPN subst:pl:inst:m1 Animacy=Hum|Case=Ins|Gender=Masc|Number=Plur 0 root _ start_char=172|end_char=181|SpaceAfter=No 4 . . 
PUNCT interp PunctType=Peri 3 punct _ start_char=181|end_char=182 # text = My jesteśmy Kaszubami. # sent_id = 8 1 My my PRON ppron12:pl:nom:m1:pri Animacy=Hum|Case=Nom|Gender=Masc|Number=Plur|Person=1|PronType=Prs 3 nsubj _ start_char=183|end_char=185|SpacesAfter=\u00A0 2 jesteśmy być AUX fin:pl:pri:imperf Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act 3 cop _ start_char=186|end_char=194|SpacesAfter=\u00A0 3 Kaszubami Kaszuby PROPN subst:pl:inst:m1 Animacy=Hum|Case=Ins|Gender=Masc|Number=Plur 0 root _ start_char=195|end_char=204|SpaceAfter=No 4 . . PUNCT interp PunctType=Peri 3 punct _ start_char=204|end_char=205|SpacesAfter=\s\u00A0 # text = język kaszubski. # sent_id = 9 1 język język NOUN subst:sg:nom:m3 Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 0 root _ start_char=207|end_char=212|SpacesAfter=\u00A0 2 kaszubski kaszubski ADJ adj:sg:nom:m3:pos Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing 1 amod _ start_char=213|end_char=222|SpaceAfter=No 3 . . PUNCT interp PunctType=Peri 1 punct _ start_char=222|end_char=223 # text = Wiem, że ona je teraz obiad. 
# sent_id = 10 1 Wiem wiedzieć VERB fin:sg:pri:imperf Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ start_char=224|end_char=228|SpaceAfter=No 2 , , PUNCT interp PunctType=Comm 7 punct _ start_char=228|end_char=229|SpacesAfter=\u00A0 3 że że SCONJ comp _ 7 mark _ start_char=230|end_char=232|SpacesAfter=\u00A0 4 ona on PRON ppron3:sg:nom:f:ter:akc:npraep Case=Nom|Gender=Fem|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs|Variant=Long 7 nsubj _ start_char=233|end_char=236|SpacesAfter=\u00A0 5 je on PRON ppron3:pl:acc:f:ter:akc:npraep Case=Acc|Gender=Neut|Number=Plur|Person=3|PrepCase=Npr|PronType=Prs|Variant=Long 7 obj _ start_char=237|end_char=239|SpacesAfter=\u00A0 6 teraz teraz ADV adv _ 7 advmod _ start_char=240|end_char=245|SpacesAfter=\u00A0 7 obiad obiad NOUN subst:sg:acc:m3 Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 1 ccomp _ start_char=246|end_char=251|SpaceAfter=No 8 . . PUNCT interp PunctType=Peri 1 punct _ start_char=251|end_char=252 # text = Ona jest Polką. # sent_id = 11 1 Ona on PRON ppron3:sg:nom:f:ter:akc:npraep Case=Nom|Gender=Fem|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs|Variant=Long 3 nsubj _ start_char=253|end_char=256|SpacesAfter=\u00A0 2 jest być AUX fin:sg:ter:imperf Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 3 cop _ start_char=257|end_char=261|SpacesAfter=\u00A0 3 Polką Polka PROPN subst:sg:inst:f Case=Ins|Gender=Fem|Number=Sing 0 root _ start_char=262|end_char=267|SpaceAfter=No 4 . . PUNCT interp PunctType=Peri 3 punct _ start_char=267|end_char=268 # text = On by nic nie robił, tylko jadł i spał. 
# sent_id = 12 1 On on PRON ppron3:sg:nom:m1:ter:akc:npraep Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs|Variant=Long 5 nsubj _ start_char=269|end_char=271|SpacesAfter=\u00A0 2 by by AUX part _ 5 aux:cnd _ start_char=272|end_char=274|SpacesAfter=\u00A0 3 nic nic PRON subst:sg:gen:n:ncol Case=Gen|Gender=Neut|Number=Sing|PronType=Neg 5 obj _ start_char=275|end_char=278|SpacesAfter=\u00A0 4 nie nie PART part Polarity=Neg 5 advmod:neg _ start_char=279|end_char=282|SpacesAfter=\u00A0 5 robił robić VERB praet:sg:m1:imperf Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ start_char=283|end_char=288|SpaceAfter=No 6 , , PUNCT interp PunctType=Comm 8 punct _ start_char=288|end_char=289|SpacesAfter=\u00A0 7 tylko tylko CCONJ conj _ 8 cc _ start_char=290|end_char=295|SpacesAfter=\u00A0 8 jadł jeść VERB praet:sg:m1:imperf Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 5 conj _ start_char=296|end_char=300|SpacesAfter=\u00A0 9 i i CCONJ conj _ 10 cc _ start_char=301|end_char=302|SpacesAfter=\u00A0 10 spał spać VERB praet:sg:m1:imperf Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 8 conj _ start_char=303|end_char=307|SpaceAfter=No 11 . . PUNCT interp PunctType=Peri 5 punct _ start_char=307|end_char=308 # text = Chciałem do miasta jechać z rana, alem zaspał. 
# sent_id = 13 1-2 Chciałem _ _ _ _ _ _ _ start_char=309|end_char=317|SpacesAfter=\u00A0 1 Chciał chcieć VERB praet:sg:m1:imperf Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ start_char=309|end_char=315 2 em być AUX aglt:sg:pri:imperf:wok Aspect=Imp|Clitic=Yes|Number=Sing|Person=1|Variant=Long 1 aux:clitic _ start_char=315|end_char=317 3 do do ADP prep:gen AdpType=Prep 4 case _ start_char=318|end_char=320|SpacesAfter=\u00A0 4 miasta miasto NOUN subst:sg:gen:n:ncol Case=Gen|Gender=Neut|Number=Sing 5 obl _ start_char=321|end_char=327|SpacesAfter=\u00A0 5 jechać jechać VERB inf:imperf Aspect=Imp|VerbForm=Inf|Voice=Act 1 xcomp _ start_char=328|end_char=334|SpacesAfter=\u00A0 6 z z ADP prep:gen:nwok AdpType=Prep|Variant=Short 7 case _ start_char=335|end_char=336|SpacesAfter=\u00A0 7 rana rano NOUN subst:sg:gen:n:ncol Case=Gen|Gender=Neut|Number=Sing 5 obl _ start_char=337|end_char=341|SpaceAfter=No 8 , , PUNCT interp PunctType=Comm 10 punct _ start_char=341|end_char=342|SpacesAfter=\u00A0 9 alem alem CCONJ conj _ 10 cc _ start_char=343|end_char=347|SpacesAfter=\u00A0 10 zaspał zaspać VERB praet:sg:m1:perf Animacy=Hum|Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 1 conj _ start_char=348|end_char=354|SpaceAfter=No 11 . . PUNCT interp PunctType=Peri 1 punct _ start_char=354|end_char=355 # text = Mam tylko wodę w tej butelce. 
# sent_id = 14 1 Mam mieć VERB fin:sg:pri:imperf Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ start_char=356|end_char=359|SpacesAfter=\u00A0 2 tylko tylko PART part _ 3 advmod:emph _ start_char=360|end_char=365|SpacesAfter=\u00A0 3 wodę woda NOUN subst:sg:acc:f Case=Acc|Gender=Fem|Number=Sing 1 iobj _ start_char=366|end_char=370|SpacesAfter=\u00A0 4 w w ADP prep:loc:nwok AdpType=Prep|Variant=Short 6 case _ start_char=371|end_char=372|SpacesAfter=\u00A0 5 tej ten DET adj:sg:loc:f:pos Case=Loc|Gender=Fem|Number=Sing|PronType=Dem 6 det _ start_char=373|end_char=376|SpacesAfter=\u00A0 6 butelce butelka NOUN subst:sg:loc:f Case=Loc|Gender=Fem|Number=Sing 1 obl _ start_char=377|end_char=384|SpaceAfter=No 7 . . PUNCT interp PunctType=Peri 1 punct _ start_char=384|end_char=385 # text = Nie śpij teraz! # sent_id = 15 1 Nie nie PART part Polarity=Neg 2 advmod:neg _ start_char=386|end_char=389|SpacesAfter=\u00A0 2 śpij śpić VERB impt:sg:sec:imperf Aspect=Imp|Mood=Imp|Number=Sing|Person=2|VerbForm=Fin|Voice=Act 0 root _ start_char=390|end_char=394|SpacesAfter=\u00A0 3 teraz teraz ADV adv _ 2 advmod _ start_char=395|end_char=400|SpaceAfter=No 4 ! ! PUNCT interp PunctType=Excl 2 punct _ start_char=400|end_char=401 # text = Śpiący kot myszy nie łapie. # sent_id = 16 1 Śpiący śpiący ADJ adj:sg:nom:m2:pos:aff Animacy=Nhum|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing 2 acl _ start_char=402|end_char=408|SpacesAfter=\u00A0 2 kot kot NOUN subst:sg:nom:m2 Animacy=Nhum|Case=Nom|Gender=Masc|Number=Sing 5 nsubj _ start_char=409|end_char=412|SpacesAfter=\u00A0 3 myszy mysz NOUN subst:sg:gen:f Case=Gen|Gender=Fem|Number=Sing 2 nmod:arg _ start_char=413|end_char=418|SpacesAfter=\u00A0 4 nie nie PART part Polarity=Neg 5 advmod:neg _ start_char=419|end_char=422|SpacesAfter=\u00A0 5 łapie łapać VERB fin:sg:ter:imperf Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ start_char=423|end_char=428|SpaceAfter=No 6 . . 
PUNCT interp PunctType=Peri 5 punct _ start_char=428|end_char=429 # text = Moniki kot rozbił dzbanek. # sent_id = 17 1 Moniki Monika PROPN subst:sg:nom:f Case=Nom|Gender=Fem|Number=Sing 3 nsubj _ start_char=430|end_char=436|SpacesAfter=\u00A0 2 kot kot NOUN subst:sg:nom:m2 Animacy=Nhum|Case=Nom|Gender=Masc|Number=Sing 1 flat _ start_char=437|end_char=440|SpacesAfter=\u00A0 3 rozbił rozbić VERB praet:sg:m3:perf Animacy=Inan|Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ start_char=441|end_char=447|SpacesAfter=\u00A0 4 dzbanek dzbanek NOUN subst:sg:acc:m3 Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 3 obj _ start_char=448|end_char=455|SpaceAfter=No 5 . . PUNCT interp PunctType=Peri 3 punct _ start_char=455|end_char=456|SpaceAfter=No
# Build a Czech pipeline. `mwt` is listed explicitly: the default Czech package
# expects it, and when it is omitted Stanza adds it implicitly and logs a
# warning ("Language cs package default expects mwt, which has been added").
nlp_cs = stanza.Pipeline('cs', processors='tokenize,mwt,pos,lemma,depparse')
# Czech sentence with a possessive adjective; it appears to mirror the Polish
# "Moniki kot rozbił dzbanek." test sentence parsed earlier.
doc_cs = nlp_cs("Moničina kočka rozbila džbán.")
# NOTE: this reuses the Polish output path, so the earlier Polish parse stored
# in "docpl.conllu" is overwritten. The path is kept because the following
# cell prints that same file.
CoNLL.write_doc2conll(doc_cs, "docpl.conllu")
INFO:stanza:Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES INFO:stanza:Downloaded file to /root/stanza_resources/resources.json WARNING:stanza:Language cs package default expects mwt, which has been added INFO:stanza:Loading these models for language: cs (Czech): ============================ | Processor | Package | ---------------------------- | tokenize | pdt | | mwt | pdt | | pos | pdt_nocharlm | | lemma | pdt_nocharlm | | depparse | pdt_nocharlm | ============================ INFO:stanza:Using device: cpu INFO:stanza:Loading: tokenize /usr/local/lib/python3.10/dist-packages/stanza/models/tokenization/trainer.py:82: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Loading: mwt /usr/local/lib/python3.10/dist-packages/stanza/models/mwt/trainer.py:201: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Loading: pos /usr/local/lib/python3.10/dist-packages/stanza/models/pos/trainer.py:139: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
checkpoint = torch.load(filename, lambda storage, loc: storage) /usr/local/lib/python3.10/dist-packages/stanza/models/common/pretrain.py:56: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. data = torch.load(self.filename, lambda storage, loc: storage) INFO:stanza:Loading: lemma /usr/local/lib/python3.10/dist-packages/stanza/models/lemma/trainer.py:239: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Loading: depparse /usr/local/lib/python3.10/dist-packages/stanza/models/depparse/trainer.py:194: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(filename, lambda storage, loc: storage) INFO:stanza:Done loading processors!
# IPython shell escape: print "docpl.conllu", which was last written by the
# Czech cell (it holds the Czech parse at this point, despite the file name).
!cat docpl.conllu
# text = Monikin džbán. # sent_id = 0 1 Monikin Monikin PROPN NNIS1-----A---- Animacy=Inan|Case=Nom|Gender=Masc|NameType=Oth|Number=Sing|Polarity=Pos 0 root _ start_char=0|end_char=7 2 džbán džbán NOUN NNIS1-----A---- Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing|Polarity=Pos 1 nmod _ start_char=8|end_char=13|SpaceAfter=No 3 . . PUNCT Z:------------- _ 1 punct _ start_char=13|end_char=14|SpaceAfter=No
# generator = UDPipe 2, https://lindat.mff.cuni.cz/services/udpipe
# udpipe_model = czech-pdt-ud-2.15-241121
# sent_id = 1
# text = Moničina kočka rozbila džbán.
1 Moničina Moničin ADJ AUFS1M--------- Case=Nom|Gender=Fem|Gender[psor]=Masc|NameType=Giv|Number=Sing|Poss=Yes 2 amod _ TokenRange=0:8
2 kočka kočka NOUN NNFS1-----A---- Case=Nom|Gender=Fem|Number=Sing 3 nsubj _ TokenRange=9:14
3 rozbila rozbít VERB VpQW----R-AAP-- Aspect=Perf|Gender=Fem,Neut|Number=Plur,Sing|Polarity=Pos|Tense=Past|VerbForm=Part|Voice=Act 0 root _ TokenRange=15:22
4 džbán džbán NOUN NNIS4-----A---- Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 3 obj _ SpaceAfter=No|TokenRange=23:28
5 . . PUNCT Z:------------- _ 3 punct _ SpaceAfter=No|TokenRange=28:29
# generator = UDPipe 2, https://lindat.mff.cuni.cz/services/udpipe
# udpipe_model = slovak-snk-ud-2.15-241121
# newdoc
# newpar
# sent_id = 1
# text = Moničina mačka rozbila džbán.
1 Moničina moničin ADJ AFfs1x:r Case=Nom|Degree=Pos|Gender=Fem|Number=Sing 2 amod _ TokenRange=0:8
2 mačka mačka NOUN SSfs1 Case=Nom|Gender=Fem|Number=Sing 3 nsubj _ TokenRange=9:14
3 rozbila rozbiť VERB VLdscf+ Aspect=Perf|Gender=Fem|Number=Sing|Polarity=Pos|Tense=Past|VerbForm=Part 0 root _ TokenRange=15:22
4 džbán džbán NOUN SSis4 Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 3 obj _ SpaceAfter=No|TokenRange=23:28
5 . . PUNCT Z _ 3 punct _ SpaceAfter=No|TokenRange=28:29