# Fetch the Universal Dependencies Irish-IDT treebank; its train/dev .conllu
# files are converted to spaCy JSON further down.
# NOTE(review): the clone is not pinned to a tag or commit, so a re-run may
# pick up a newer treebank release than the one the logged scores refer to.
!git clone https://github.com/UniversalDependencies/UD_Irish-IDT
Cloning into 'UD_Irish-IDT'...
remote: Enumerating objects: 32, done.
remote: Counting objects: 100% (32/32), done.
remote: Compressing objects: 100% (23/23), done.
remote: Total 328 (delta 14), reused 25 (delta 9), pack-reused 296
Receiving objects: 100% (328/328), 3.63 MiB | 12.73 MiB/s, done.
Resolving deltas: 100% (182/182), done.
# Create the output directory for the converted corpora. `-p` keeps the cell
# re-runnable: a plain `mkdir` errors out once the directory already exists.
!mkdir -p idt-json
# Convert the CoNLL-U training split into spaCy's JSON training format.
!python -m spacy convert /content/UD_Irish-IDT/ga_idt-ud-train.conllu /content/idt-json
✔ Generated output file (2019 documents):
/content/idt-json/ga_idt-ud-train.json
# Convert the CoNLL-U development split; it is passed to `spacy train` as the
# evaluation set later on.
!python -m spacy convert /content/UD_Irish-IDT/ga_idt-ud-dev.conllu /content/idt-json
✔ Generated output file (451 documents):
/content/idt-json/ga_idt-ud-dev.json
# Download the fastText Common Crawl word vectors for Irish (300 dimensions).
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz
# Build a blank `ga` model in /content/ga_vectors_cc that carries those
# vectors; `spacy train` loads them from there via its -v option.
!python -m spacy init-model ga /content/ga_vectors_cc --vectors-loc cc.ga.300.vec.gz
--2020-09-14 17:16:11--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 184422000 (176M) [binary/octet-stream]
Saving to: ‘cc.ga.300.vec.gz’

cc.ga.300.vec.gz    100%[===================>] 175.88M  44.2MB/s    in 4.0s    

2020-09-14 17:16:16 (43.8 MB/s) - ‘cc.ga.300.vec.gz’ saved [184422000/184422000]

✔ Successfully created model
316836it [00:27, 11398.56it/s]
✔ Loaded vectors from cc.ga.300.vec.gz
✔ Sucessfully compiled vocab
317041 entries, 316836 vectors

WikiANN is currently only available through Google Drive, so the next cells mount the drive and copy the Irish archive (`ga.tar.gz`) from it.

# Mount the user's Google Drive at /gdrive so the WikiANN archive stored there
# can be copied into the working directory.
from google.colab import drive
drive.mount('/gdrive')
Mounted at /gdrive
# Copy the WikiANN Irish archive out of the mounted drive (the path contains a
# space, hence the quotes) and unpack it in the current directory.
!cp "/gdrive/My Drive/ga.tar.gz" .
!tar -xzvf ga.tar.gz
README.txt
wikiann-ga.bio
# Download the DBpedia interlanguage-links dump (Turtle; roughly 1 GB according
# to the logged Content-Length). It is filtered against the WikiANN data below.
# NOTE(review): fetched over plain HTTP with no checksum verification.
!wget http://downloads.dbpedia.org/links/resources/wikidatadump/2017-07-07/enwiki/20170701/enwiki-20170701-interlanguage-links_wikidataorg.ttl
--2020-09-14 17:15:11--  http://downloads.dbpedia.org/links/resources/wikidatadump/2017-07-07/enwiki/20170701/enwiki-20170701-interlanguage-links_wikidataorg.ttl
Resolving downloads.dbpedia.org (downloads.dbpedia.org)... 139.18.16.66
Connecting to downloads.dbpedia.org (downloads.dbpedia.org)|139.18.16.66|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1020894244 (974M) [text/turtle]
Saving to: ‘enwiki-20170701-interlanguage-links_wikidataorg.ttl’

enwiki-20170701-int 100%[===================>] 973.60M  18.7MB/s    in 54s     

2020-09-14 17:16:05 (18.1 MB/s) - ‘enwiki-20170701-interlanguage-links_wikidataorg.ttl’ saved [1020894244/1020894244]

# Keep only the interlanguage-link triples that mention a value found in the
# sixth field of the 7-field lines of wikiann-ga.bio.
# The dump is scanned once with all patterns ("/<value>>") loaded as fixed
# strings, instead of running one full grep over the ~1 GB file per value.
# -F also stops regex metacharacters inside a value from matching unrelated
# lines. `>` truncates `filtered`, so re-running the cell no longer appends
# duplicates. Matching lines come out in dump order, each at most once.
!awk '(NF == 7){print "/" $6 ">"}' wikiann-ga.bio | sort -u > wikiann-ga.patterns
!grep -F -f wikiann-ga.patterns enwiki-20170701-interlanguage-links_wikidataorg.ttl > filtered
# Pin danlp to the release this notebook was run with: the conversion step
# calls a private helper (`_convert_wikiann_to_iob`), which carries no
# stability guarantee across versions.
!pip install danlp==0.0.9
Collecting danlp
  Downloading https://files.pythonhosted.org/packages/3c/79/96d0d3f3634ce75787d408383fa81cdd854552e27e4e279a985b511a6d88/danlp-0.0.9-py3-none-any.whl
Collecting pyconll
  Downloading https://files.pythonhosted.org/packages/2c/6e/c325d0db05ac1b8d45645de903e4ba691d419e861c915c3d4ebfcaf8ac25/pyconll-2.2.1-py3-none-any.whl
Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from danlp) (4.41.1)
Requirement already satisfied: tweepy in /usr/local/lib/python3.6/dist-packages (from danlp) (3.6.0)
Requirement already satisfied: requests>=2.21 in /usr/local/lib/python3.6/dist-packages (from pyconll->danlp) (2.23.0)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy->danlp) (1.15.0)
Requirement already satisfied: PySocks>=1.5.7 in /usr/local/lib/python3.6/dist-packages (from tweepy->danlp) (1.7.1)
Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tweepy->danlp) (1.3.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.21->pyconll->danlp) (2020.6.20)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.21->pyconll->danlp) (2.10)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.21->pyconll->danlp) (1.24.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.21->pyconll->danlp) (3.0.4)
Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->tweepy->danlp) (3.1.0)
Installing collected packages: pyconll, danlp
Successfully installed danlp-0.0.9 pyconll-2.2.1
import danlp.datasets.wiki_ann
wa = danlp.datasets.wiki_ann._convert_wikiann_to_iob('wikiann-ga.bio', 'wikiann-ga.ner')
!head out
Colm _ _ B-PER
Ó _ _ I-PER
Ruairc _ _ I-PER

Seosamh _ _ B-PER
Ó _ _ I-PER
Cainín _ _ I-PER

Dónal _ _ B-PER
Ó _ _ I-PER
# Convert the IOB file to spaCy JSON, grouping every 10 sentences into one
# document.
!python -m spacy convert --n-sents 10 wikiann-ga.ner /content/idt-json/
ℹ Auto-detected token-per-line NER format
ℹ Grouping every 10 sentences into a document.
✔ Generated output file (757 documents):
/content/idt-json/wikiann-ga.json
# Start from an empty output directory so checkpoints of earlier runs cannot
# be mistaken for the current ones.
!rm -rf models && mkdir models
# Train a blank `ga` pipeline on the IDT train split, evaluate on the dev
# split, and initialise from the fastText vectors model built earlier.
# NOTE(review): `ner` is requested here, but the captured log reports
# "Training pipeline: ['tagger', 'parser']" and the WikiANN JSON is never passed
# as training data; confirm whether NER is meant to be trained in this step.
!python -m spacy train --vectors /content/ga_vectors_cc --pipeline 'tagger,parser,ner' ga models idt-json/ga_idt-ud-train.json idt-json/ga_idt-ud-dev.json
Training pipeline: ['tagger', 'parser']
Starting with blank model 'ga'
Loading vector from model '/content/ga_vectors_cc'
Counting training words (limit=0)
/usr/lib/python3.6/runpy.py:193: UserWarning: [W022] Training a new part-of-speech tagger using a model with no lemmatization rules or data. This means that the trained model may not be able to lemmatize correctly. If this is intentional or the language you're using doesn't have lemmatization data, you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. If this is surprising, make sure you have the spacy-lookups-data package installed.
  "__main__", mod_spec)

Itn  Tag Loss    Tag %    Dep Loss    UAS     LAS    Token %  CPU WPS
---  ---------  --------  ---------  ------  ------  -------  -------
  1  14058.829    90.650  43482.222  74.804  56.787  100.000    11293
  2   6188.294    92.810  34097.493  79.836  66.009  100.000    11461
  3   4475.949    93.400  30061.441  81.314  69.572  100.000    11930
  4   3549.242    93.530  27752.841  82.784  71.759  100.000    11719
  5   2916.639    93.570  25861.771  83.066  72.401  100.000    11616
  6   2438.355    93.550  24533.545  83.133  72.726  100.000    12227
  7   2084.913    93.500  22901.218  83.281  73.043  100.000    11842
  8   1845.607    93.610  21836.129  83.516  73.346  100.000    12094
  9   1698.212    93.630  20626.109  83.555  73.507  100.000    11907
 10   1406.626    93.570  19251.761  83.712  73.978  100.000    11926
 11   1366.677    93.620  18882.570  83.896  74.128  100.000    12023
 12   1209.500    93.610  17836.598  83.968  74.177  100.000    11924
 13   1140.886    93.640  17341.624  84.098  74.375  100.000    11522
 14   1043.542    93.670  16748.375  83.992  74.292  100.000    11766
 15    926.876    93.700  15727.938  84.183  74.572  100.000    11931
 16    848.805    93.680  15002.112  84.059  74.427  100.000    11750
 17    857.415    93.760  14686.168  84.075  74.465  100.000    11724
 18    775.277    93.750  14028.872  84.091  74.603  100.000    11890
 19    651.078    93.680  13698.526  84.215  74.794  100.000    11932
 20    672.552    93.670  13036.999  84.356  74.879  100.000    11724
 21    590.244    93.670  12162.862  84.468  75.048  100.000    11851
 22    593.722    93.680  12494.905  84.441  75.122  100.000    11910
 23    582.541    93.660  12110.757  84.351  75.032  100.000    11544
 24    514.448    93.690  11635.750  84.232  74.879  100.000    11984
 25    491.457    93.640  10942.966  84.226  74.816  100.000    12106
 26    521.324    93.660  10958.952  84.232  74.779  100.000    12112
 27    507.717    93.650  10907.860  84.255  74.790  100.000    11754
 28    485.186    93.660  10149.477  84.143  74.666  100.000    11411
 29    507.038    93.720  10331.116  84.165  74.644  100.000    11740
 30    477.966    93.700   9649.121  84.300  74.891  100.000    11300
✔ Saved model to output directory
models/model-final
✔ Created best model
models/model-best
# Directory that receives the generated Python package. `-p` keeps the cell
# re-runnable when the directory is already there.
!mkdir -p modelout
# Wrap the best checkpoint into an installable package, taking name, version
# and the other metadata from meta.json in the current directory.
!python -m spacy package --meta meta.json /content/models/model-best modelout
✔ Loaded meta.json from file
meta.json
✔ Successfully created package 'ga_idt_lg-1.0.0'
modelout/ga_idt_lg-1.0.0
To build the package, run `python setup.py sdist` in this directory.
import os
os.chdir('/content/modelout/ga_idt_lg-1.0.0')
!python setup.py sdist
running sdist
running egg_info
creating ga_idt_lg.egg-info
writing ga_idt_lg.egg-info/PKG-INFO
writing dependency_links to ga_idt_lg.egg-info/dependency_links.txt
writing requirements to ga_idt_lg.egg-info/requires.txt
writing top-level names to ga_idt_lg.egg-info/top_level.txt
writing manifest file 'ga_idt_lg.egg-info/SOURCES.txt'
reading manifest file 'ga_idt_lg.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'ga_idt_lg.egg-info/SOURCES.txt'
warning: sdist: standard file not found: should have one of README, README.rst, README.txt, README.md

running check
creating ga_idt_lg-1.0.0
creating ga_idt_lg-1.0.0/ga_idt_lg
creating ga_idt_lg-1.0.0/ga_idt_lg.egg-info
creating ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0
creating ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/parser
creating ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/tagger
creating ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/vocab
copying files to ga_idt_lg-1.0.0...
copying MANIFEST.in -> ga_idt_lg-1.0.0
copying meta.json -> ga_idt_lg-1.0.0
copying setup.py -> ga_idt_lg-1.0.0
copying ga_idt_lg/__init__.py -> ga_idt_lg-1.0.0/ga_idt_lg
copying ga_idt_lg/meta.json -> ga_idt_lg-1.0.0/ga_idt_lg
copying ga_idt_lg.egg-info/PKG-INFO -> ga_idt_lg-1.0.0/ga_idt_lg.egg-info
copying ga_idt_lg.egg-info/SOURCES.txt -> ga_idt_lg-1.0.0/ga_idt_lg.egg-info
copying ga_idt_lg.egg-info/dependency_links.txt -> ga_idt_lg-1.0.0/ga_idt_lg.egg-info
copying ga_idt_lg.egg-info/not-zip-safe -> ga_idt_lg-1.0.0/ga_idt_lg.egg-info
copying ga_idt_lg.egg-info/requires.txt -> ga_idt_lg-1.0.0/ga_idt_lg.egg-info
copying ga_idt_lg.egg-info/top_level.txt -> ga_idt_lg-1.0.0/ga_idt_lg.egg-info
copying ga_idt_lg/ga_idt_lg-1.0.0/meta.json -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0
copying ga_idt_lg/ga_idt_lg-1.0.0/tokenizer -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0
copying ga_idt_lg/ga_idt_lg-1.0.0/parser/cfg -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/parser
copying ga_idt_lg/ga_idt_lg-1.0.0/parser/model -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/parser
copying ga_idt_lg/ga_idt_lg-1.0.0/parser/moves -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/parser
copying ga_idt_lg/ga_idt_lg-1.0.0/tagger/cfg -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/tagger
copying ga_idt_lg/ga_idt_lg-1.0.0/tagger/model -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/tagger
copying ga_idt_lg/ga_idt_lg-1.0.0/tagger/tag_map -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/tagger
copying ga_idt_lg/ga_idt_lg-1.0.0/vocab/key2row -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/vocab
copying ga_idt_lg/ga_idt_lg-1.0.0/vocab/lexemes.bin -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/vocab
copying ga_idt_lg/ga_idt_lg-1.0.0/vocab/strings.json -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/vocab
copying ga_idt_lg/ga_idt_lg-1.0.0/vocab/vectors -> ga_idt_lg-1.0.0/ga_idt_lg/ga_idt_lg-1.0.0/vocab
Writing ga_idt_lg-1.0.0/setup.cfg
creating dist
Creating tar archive
removing 'ga_idt_lg-1.0.0' (and everything under it)
# Print the metadata spaCy stored with the best checkpoint (pipeline, labels,
# accuracy and speed figures).
!cat /content/models/model-best/meta.json
{
  "lang":"ga",
  "name":"model",
  "version":"0.0.0",
  "spacy_version":">=2.2.4",
  "description":"",
  "author":"",
  "email":"",
  "url":"",
  "license":"",
  "vectors":{
    "width":300,
    "vectors":316836,
    "keys":316836,
    "name":"ga_model.vectors"
  },
  "pipeline":[
    "tagger",
    "parser"
  ],
  "factories":{
    "tagger":"tagger",
    "parser":"parser"
  },
  "labels":{
    "tagger":[
      "!",
      ".",
      "...",
      "?",
      "Abr",
      "Ad",
      "Adj",
      "Art",
      "CM",
      "CU",
      "Cmp",
      "Cmpd",
      "CmpdNoGen",
      "Comp",
      "Cond",
      "Coord",
      "Cop",
      "Cp",
      "Deg",
      "Dem",
      "Det",
      "Dir",
      "Foreign",
      "FutInd",
      "Gn",
      "Idf",
      "Imper",
      "Inf",
      "Item",
      "Itj",
      "Its",
      "Loc",
      "Nm",
      "Noun",
      "Num",
      "PastImp",
      "PastInd",
      "Pat",
      "Pers",
      "Poss",
      "Prep",
      "PresImp",
      "PresInd",
      "PresSubj",
      "Pron",
      "Punct",
      "Q",
      "Ref",
      "Rel",
      "Simp",
      "Subord",
      "Subst",
      "Sup",
      "Temp",
      "Unknown",
      "VD",
      "VI",
      "VT",
      "VTI",
      "Vb",
      "Voc",
      "Web",
      "_SP",
      "cionn"
    ],
    "parser":[
      "ROOT",
      "acl:relcl",
      "advcl",
      "advmod",
      "amod",
      "appos",
      "case",
      "cc",
      "ccomp",
      "compound",
      "conj",
      "cop",
      "csubj:cleft",
      "csubj:cop",
      "dep",
      "det",
      "fixed",
      "flat",
      "flat:name",
      "mark",
      "mark:prt",
      "nmod",
      "nmod:poss",
      "nsubj",
      "nummod",
      "obj",
      "obl",
      "obl:prep",
      "obl:tmod",
      "parataxis",
      "punct",
      "xcomp",
      "xcomp:pred"
    ]
  },
  "accuracy":{
    "tags_acc":92.23,
    "token_acc":100.0,
    "las":68.3640850205,
    "uas":80.5899837362,
    "las_per_type":{
      "nummod":{
        "p":70.0,
        "r":61.5384615385,
        "f":65.4970760234
      },
      "root":{
        "p":88.0266075388,
        "r":88.0266075388,
        "f":88.0266075388
      },
      "case":{
        "p":88.8535031847,
        "r":91.7763157895,
        "f":90.2912621359
      },
      "obl":{
        "p":47.0031545741,
        "r":54.9815498155,
        "f":50.6802721088
      },
      "mark:prt":{
        "p":71.1538461538,
        "r":81.9620253165,
        "f":76.1764705882
      },
      "ccomp":{
        "p":40.2777777778,
        "r":47.5409836066,
        "f":43.6090225564
      },
      "nsubj":{
        "p":75.1824817518,
        "r":79.7213622291,
        "f":77.3854244929
      },
      "obj":{
        "p":55.5555555556,
        "r":49.2957746479,
        "f":52.2388059701
      },
      "nmod":{
        "p":52.912142152,
        "r":54.8618219038,
        "f":53.8693467337
      },
      "mark":{
        "p":82.7715355805,
        "r":72.6973684211,
        "f":77.408056042
      },
      "xcomp":{
        "p":60.4743083004,
        "r":65.3846153846,
        "f":62.8336755647
      },
      "acl:relcl":{
        "p":47.2602739726,
        "r":53.488372093,
        "f":50.1818181818
      },
      "xcomp:pred":{
        "p":44.0476190476,
        "r":59.6774193548,
        "f":50.6849315068
      },
      "amod":{
        "p":57.5438596491,
        "r":54.3046357616,
        "f":55.8773424191
      },
      "det":{
        "p":92.8480204342,
        "r":94.0491591203,
        "f":93.4447300771
      },
      "csubj:cleft":{
        "p":47.2222222222,
        "r":27.4193548387,
        "f":34.693877551
      },
      "obl:prep":{
        "p":77.6041666667,
        "r":65.6387665198,
        "f":71.1217183771
      },
      "advcl":{
        "p":54.4,
        "r":49.2753623188,
        "f":51.711026616
      },
      "parataxis":{
        "p":42.4242424242,
        "r":27.4509803922,
        "f":33.3333333333
      },
      "nmod:poss":{
        "p":73.4939759036,
        "r":75.3086419753,
        "f":74.3902439024
      },
      "cc":{
        "p":78.9473684211,
        "r":79.5454545455,
        "f":79.2452830189
      },
      "conj":{
        "p":42.7609427609,
        "r":42.0529801325,
        "f":42.4040066778
      },
      "dep":{
        "p":0.0,
        "r":0.0,
        "f":0.0
      },
      "compound":{
        "p":75.0,
        "r":26.0869565217,
        "f":38.7096774194
      },
      "flat":{
        "p":64.1025641026,
        "r":64.9350649351,
        "f":64.5161290323
      },
      "cop":{
        "p":69.3251533742,
        "r":70.625,
        "f":69.9690402477
      },
      "flat:name":{
        "p":63.4782608696,
        "r":51.4084507042,
        "f":56.8093385214
      },
      "obl:tmod":{
        "p":66.6666666667,
        "r":2.7397260274,
        "f":5.2631578947
      },
      "advmod":{
        "p":66.2745098039,
        "r":65.0,
        "f":65.6310679612
      },
      "appos":{
        "p":21.9512195122,
        "r":20.9302325581,
        "f":21.4285714286
      },
      "flat:foreign":{
        "p":0.0,
        "r":0.0,
        "f":0.0
      },
      "fixed":{
        "p":74.7663551402,
        "r":61.0687022901,
        "f":67.2268907563
      },
      "csubj:cop":{
        "p":62.5,
        "r":55.5555555556,
        "f":58.8235294118
      },
      "discourse":{
        "p":0.0,
        "r":0.0,
        "f":0.0
      },
      "case:voc":{
        "p":0.0,
        "r":0.0,
        "f":0.0
      },
      "vocative":{
        "p":0.0,
        "r":0.0,
        "f":0.0
      }
    }
  },
  "speed":{
    "cpu":13038.7132631094,
    "gpu":null,
    "nwords":10000
  }
}
# Return to the notebook's working directory and reset the packaging output
# directory before building another package.
import os
os.chdir('/content')
!rm -rf modelout
!mkdir modelout
# NOTE(review): meta.json is removed and then printed straight away, yet the
# captured output shows a new file. The cell that writes the new meta.json
# appears to be missing here; as written, `cat` fails on a fresh run.
!rm meta.json
!cat meta.json
{
  "name": "ga_idt_sm",
  "lang": "ga",
  "version": "1.0.0",
  "spacy_version": ">=2.0.0,<3.0.0",
  "description": "Irish model for spaCy trained on IDT",
  "author": "Jim O'Regan",
  "email": "jaoregan@tcd.ie",
  "license": "CC BY-SA 3.0",
  "pipeline": ["tagger", "parser", "ner"]
}