Generate seanchló data
Creating synthetic data for training
# Wayback Machine snapshots of the font archives to download, one URL per line.
# NOTE: bunargc.zip, seandgc.zip and seantgc.zip are each listed twice; the
# literal is left untouched and the repeats are dropped when FONTS is built.
FONT_LINKS = """
https://web.archive.org/web/20130404225932/http://www.iol.ie/~sob/gadelica.zip
http://web.archive.org/web/20120830224624/http://www.gaelchlo.com/bunardgc.zip
http://web.archive.org/web/20220322030900/https://www.gaelchlo.com/bunargc.zip
http://web.archive.org/web/20120830224559/http://www.gaelchlo.com/bunartgc.zip
http://web.archive.org/web/20191216105737/http://www.gaelchlo.com/bundgc.zip
http://web.archive.org/web/20120830224343/http://www.gaelchlo.com/bunnod.zip
http://web.archive.org/web/20191218121436/http://www.gaelchlo.com/buntgc.zip
http://web.archive.org/web/20160408212120/http://gaelchlo.com/seandgc.zip
http://web.archive.org/web/20160408212013/http://gaelchlo.com/seantgc.zip
http://web.archive.org/web/20220322030900/https://www.gaelchlo.com/bunargc.zip
http://web.archive.org/web/20240119141814/https://www.gaelchlo.com/bungc.zip
http://web.archive.org/web/20160408212120/http://gaelchlo.com/seandgc.zip
http://web.archive.org/web/20240119143113/https://www.gaelchlo.com/seangc.zip
http://web.archive.org/web/20160408212013/http://gaelchlo.com/seantgc.zip
http://web.archive.org/web/20240619180031/http://www.gaelchlo.com/urgc.zip
http://web.archive.org/web/20240119141944/https://www.gaelchlo.com/aongc.zip
http://web.archive.org/web/20240619182847/https://www.gaelchlo.com/ardeorgc.zip
http://web.archive.org/web/20240119142032/https://www.gaelchlo.com/ardgc.zip
http://web.archive.org/web/20240119142040/https://www.gaelchlo.com/barrgc.zip
http://web.archive.org/web/20240119142035/https://www.gaelchlo.com/casgc.zip
http://web.archive.org/web/20240619182918/https://www.gaelchlo.com/corrgc.zip
http://web.archive.org/web/20240119142042/https://www.gaelchlo.com/deasgc.zip
http://web.archive.org/web/20240119142045/https://www.gaelchlo.com/dluthgc.zip
http://web.archive.org/web/20240119142026/https://www.gaelchlo.com/dubhgc.zip
http://web.archive.org/web/20240119142025/https://www.gaelchlo.com/fiorgc.zip
http://web.archive.org/web/20240119142051/https://www.gaelchlo.com/geargc.zip
http://web.archive.org/web/20240119142055/https://www.gaelchlo.com/glangc.zip
http://web.archive.org/web/20240119143123/https://www.gaelchlo.com/langc.zip
http://web.archive.org/web/20240619183031/https://www.gaelchlo.com/lomgc.zip
http://web.archive.org/web/20240619183107/https://www.gaelchlo.com/meargc.zip
http://web.archive.org/web/20240619183125/https://www.gaelchlo.com/mingc.zip
"""
# Unique, non-empty URLs in first-seen order. dict.fromkeys() drops repeats
# while preserving insertion order, so each archive is fetched only once.
FONTS = list(
    dict.fromkeys(url.strip() for url in FONT_LINKS.splitlines() if url.strip())
)
for font in FONTS:
!wget {font}
!unzip {font.split("/")[-1]}
!mkdir fonts
!mv *.ttf fonts
%%capture
# Install the FontForge Python bindings; the `fontforge` module is imported
# further down to convert fonts. %%capture suppresses the installer output.
!apt install python3-fontforge
%%capture
# Install the lynx text browser, used on the next line to scrape download links.
!apt install lynx
# Build `fontlist`:
#   1. dump gaelchlo.com's clonna1.html as text and keep lines containing "html"
#      that also match [12][0-9]\. (presumably lynx link-reference numbers
#      10-29 -- TODO confirm against the page);
#   2. take the second whitespace-separated field of each line (the URL);
#   3. dump each of those pages and keep lines containing "zip" and "https";
#   4. take the second field again, then sort and de-duplicate.
!lynx -dump https://www.gaelchlo.com/clonna1.html|grep html|grep '[12][0-9]\.'|awk '{print $2}'|while read i;do lynx -dump $i|grep zip;done|grep https|awk '{print $2}'|sort|uniq > fontlist
# Download every URL listed in fontlist.
!wget -i fontlist
%%capture
# Unpack every file whose name ends in "zip" in the working directory;
# -o overwrites existing files without prompting.
!for i in *zip;do unzip -o $i;done
# Write `iafonts`: the unique fontlist URLs, minus any ending in "w.zip",
# each prefixed with the Wayback Machine base URL.
!cat fontlist|sort|uniq|grep -v 'w.zip$'|awk '{print "http://web.archive.org/web/" $0}' > iafonts
# Disabled: fetch the archived copies listed in iafonts, logging to iafonts.log.
#!wget -i iafonts -o iafonts.log
from pathlib import Path
import fontforge
# Convert every OpenType font found anywhere under the working directory to
# TrueType, writing the result to fonts/<stem>.ttf.
# Make sure the output directory exists so this cell also works on its own.
Path("fonts").mkdir(exist_ok=True)
for otf in Path(".").glob("**/*.otf"):
    font = fontforge.open(str(otf))
    try:
        outname = str(Path("fonts") / f"{otf.stem}.ttf")
        # The output format is chosen from the file extension.
        font.generate(outname)
    finally:
        # Release the font even if generation fails, so fonts do not pile up
        # in memory while looping over many files.
        font.close()
# Fetch three more font archives through the Wayback Machine: two gaelchlo.com
# snapshots (miongc, morgc) and the Macalla pack from inkwell.ie.
# NOTE(review): nothing visible after this point unzips these archives or moves
# their fonts into fonts/ -- confirm they are actually used.
!wget https://web.archive.org/web/20240619184509/https://www.gaelchlo.com/miongc.zip
!wget http://web.archive.org/web/20240619184517/https://www.gaelchlo.com/morgc.zip
!wget http://web.archive.org/web/https://inkwell.ie/font-packs/Macalla/Macalla-Regular-ttf.zip
%%capture
# Install trdg, whose command-line tool is run at the end of the notebook to
# generate the synthetic samples. %%capture suppresses pip's output.
!pip install trdg
%%capture
# Download the gle_uncial training text from the tesseract-ocr/langdata
# repository; it is saved as gle_uncial.training_text in the working directory.
!wget https://raw.githubusercontent.com/tesseract-ocr/langdata/main/gle_uncial/gle_uncial.training_text
# Copy the downloaded training text to training_text.txt, keeping only lines
# shorter than 195 characters once surrounding whitespace is stripped.
# NOTE(review): the reason for the 195-character limit is not visible here.
# The encoding is given explicitly: the text is Irish and contains non-ASCII
# characters, so relying on the locale default could corrupt it or fail.
with open("gle_uncial.training_text", "r", encoding="utf-8") as inf, \
        open("training_text.txt", "w", encoding="utf-8") as outf:
    # Iterate the file directly instead of loading every line into a list.
    for line in inf:
        text = line.strip()
        # Blank lines have length 0, so they pass through as empty lines.
        if len(text) < 195:
            outf.write(text + "\n")
# Remove any previous output so old samples do not mix with the new run.
!rm -rf out/
# Generate 10000 samples (-c) from the lines of training_text.txt (-i) using
# the fonts in fonts/ (--font_dir).
# NOTE(review): per the trdg README, --name_format 2 should name images by ID
# and write a separate labels file -- confirm against the installed version.
!trdg -c 10000 -i training_text.txt --name_format 2 --font_dir fonts
# Zip every regular file under out/ into seanchlo_generated.zip
# (-@ makes zip read the file names from stdin).
!find out -type f|zip seanchlo_generated.zip -@