Training spaCy on IDT (the Irish Dependency Treebank, UD_Irish-IDT)
I've forgotten where I put the output model, though
!git clone https://github.com/UniversalDependencies/UD_Irish-IDT
!mkdir idt-json
!python -m spacy convert /content/UD_Irish-IDT/ga_idt-ud-train.conllu /content/idt-json
!python -m spacy convert /content/UD_Irish-IDT/ga_idt-ud-dev.conllu /content/idt-json
# Download the fastText crawl vectors file for `ga` (cc.ga.300.vec.gz) into
# the current working directory.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz
# Initialise a `ga` spaCy model at /content/ga_vectors_cc from those vectors.
# NOTE(review): --vectors-loc is a relative path, so this must run from the
# directory the wget above downloaded into.
!python -m spacy init-model ga /content/ga_vectors_cc --vectors-loc cc.ga.300.vec.gz
WikiANN is currently only available through Google Drive, so the archive is copied from a mounted Drive folder
# Mount Google Drive at /gdrive (google.colab is only importable inside
# Colab), then copy ga.tar.gz out of "My Drive" into the current directory
# and unpack it there.
from google.colab import drive
drive.mount('/gdrive')
!cp /gdrive/My\ Drive/ga.tar.gz .
# presumably this archive provides wikiann-ga.bio, which later cells read
# -- TODO confirm against the archive contents.
!tar zxvf ga.tar.gz
!wget http://downloads.dbpedia.org/links/resources/wikidatadump/2017-07-07/enwiki/20170701/enwiki-20170701-interlanguage-links_wikidataorg.ttl
!cat wikiann-ga.bio | awk '(NF == 7){print $6}'|sort|uniq|while read i;do grep "/$i>" enwiki-20170701-interlanguage-links_wikidataorg.ttl >> filtered;done
!pip install danlp
import danlp.datasets.wiki_ann
wa = danlp.datasets.wiki_ann._convert_wikiann_to_iob('wikiann-ga.bio', 'wikiann-ga.ner')
!head out
!python -m spacy convert -n 10 wikiann-ga.ner /content/idt-json/
# Start from an empty `models` output directory (any previous run's output is
# deleted), then train a `ga` pipeline with tagger, parser and ner components
# using the vectors model at /content/ga_vectors_cc.
!rm -rf models
!mkdir models
# NOTE(review): only the two IDT JSON files are passed as train/dev data; the
# WikiANN file converted into idt-json earlier is not referenced here --
# confirm the ner component has the training data it is meant to have.
!python -m spacy train -v /content/ga_vectors_cc -p 'tagger,parser,ner' ga models idt-json/ga_idt-ud-train.json idt-json/ga_idt-ud-dev.json
# Package the best model from training into `modelout`, using the meta.json
# in the current directory, then build a source distribution of the package.
!mkdir modelout
# NOTE(review): meta.json is not created anywhere in this notebook; it has to
# be present in the working directory before this cell runs.
!python -m spacy package --meta meta.json /content/models/model-best modelout
import os
# presumably `ga_idt_lg-1.0.0` is the directory name spaCy derives from the
# lang/name/version fields of meta.json -- verify if those fields change.
os.chdir('/content/modelout/ga_idt_lg-1.0.0')
!python setup.py sdist
# Scratch cells for inspecting metadata and resetting the packaging output.
# Print the meta.json that training wrote next to the best model.
!cat /content/models/model-best/meta.json
# NOTE(review): `os` is imported a second time here; harmless, but redundant
# when the notebook is run top to bottom.
import os
# Return to /content and recreate an empty `modelout` directory.
os.chdir('/content')
!rm -rf modelout
!mkdir modelout
# NOTE(review): as written, the `cat` runs right after the `rm`, so it will
# report a missing file unless meta.json is recreated in between -- presumably
# a new meta.json was supplied between these two cells; confirm.
!rm meta.json
!cat meta.json