English hyphenation from Wiktionary
Part of the KTH application assignment
Original on Kaggle
%%capture
# Build hyphenation.tsv from the English Wiktionary dump.
# Output format: one row per hyphenation, "<word>\t<syllables separated by spaces>".
#
# NOTE(review): IPython tries `{expr}` / `$name` interpolation on `!` lines and
# appears to leave the line untouched when that interpolation fails; the perl
# and sed one-liners below depend on their braces reaching the shell unchanged.
# Verify after any edit to these lines.

# Step 1: fetch the dump. -nc skips the download when the file is already
# present, so re-running the cell does not fetch it again or create ".1" copies.
# NOTE(review): this is a dated dump URL; confirm it still resolves.
!wget -nc https://dumps.wikimedia.org/enwiktionary/20210620/enwiktionary-20210620-pages-articles-multistream.xml.bz2

# Step 2: keep only lines mentioning the hyphenation template with a language
# code starting with "en". -F makes the "|" an explicit literal. This is a
# coarse pre-filter (it also lets e.g. "hyphenation|enm" through); the exact
# "{{hyphenation|en|" match is done in step 4.
!bzcat enwiktionary-20210620-pages-articles-multistream.xml.bz2 | grep -F 'hyphenation|en' > /tmp/rawhyph

# Step 3 (diagnostic only, writes no files): show the lines carrying a U.S.
# accent qualifier after that qualifier has been stripped.
!grep '{{a|U.S.' /tmp/rawhyph|sed -e 's/{a|U.S.}//;s/{}//'

# Step 4: extract the syllables.
#   - strip the "{{a|U.S.}}" qualifier and turn ''':''' separators into "|"
#   - skip lines without a "{{hyphenation|en|...}}" template (avoids blank rows)
#   - use the first template on the line, up to its first "}}"
#   - an empty parameter ("||") separates alternative hyphenations: one row each
#   - drop named parameters (anything containing "=") wherever they appear
!perl -ne 's/\{a\|U\.S\.\}//; s/\{\}//; s/\x27{3}:\x27{3}/|/g; next unless /\{\{hyphenation\|en\|(.*?)\}\}/; for my $alt (split /\|\|+/, $1) { my @syl = grep { $_ ne "" && !/=/ } split /\|/, $alt; print join("", @syl), "\t", join(" ", @syl), "\n" if @syl }' /tmp/rawhyph | sort | uniq > hyphenation.tsv