!wget https://dumps.wikimedia.org/enwiktionary/20211101/enwiktionary-20211101-pages-articles.xml.bz2
--2021-11-02 17:31:36--  https://dumps.wikimedia.org/enwiktionary/20211101/enwiktionary-20211101-pages-articles.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.7, 2620:0:861:1:208:80:154:7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 892543646 (851M) [application/octet-stream]
Saving to: ‘enwiktionary-20211101-pages-articles.xml.bz2’

enwiktionary-202111 100%[===================>] 851.20M  4.25MB/s    in 3m 21s  

2021-11-02 17:34:58 (4.23 MB/s) - ‘enwiktionary-20211101-pages-articles.xml.bz2’ saved [892543646/892543646]

%%writefile extract-ipa.pl
#!/usr/bin/perl
# Filter a MediaWiki XML dump read from standard input (or from files named
# on the command line) down to the lines that carry an Irish {{IPA|ga|...}}
# template. Each hit is printed as:
#
#     <page title> TAB <full text of the matching line>
#
# The page title is whatever the most recent <title>...</title> element
# contained; it is the empty string until the first such element is seen.

use warnings;
use strict;
use utf8;

# Treat all three standard streams as UTF-8 text.
binmode($_, ":utf8") for \*STDIN, \*STDOUT, \*STDERR;

my $current_title = '';
while (my $line = <>) {
    chomp $line;

    # Remember the title of the page we are currently inside.
    $current_title = $1 if $line =~ /<title>([^<]*)<\/title>/;

    # Only lines that open an Irish IPA template are of interest.
    next unless $line =~ m!\{\{IPA\|ga\|!;

    print join("\t", $current_title, $line), "\n";
}
Writing extract-ipa.pl
!bzcat enwiktionary-20211101-pages-articles.xml.bz2|perl extract-ipa.pl > wikt-ipa.txt
@phdthesis{hughes1986gaelic,
  title={The Gaelic of Tangaveane and Commeen, County Donegal (texts, phonology, aspects of grammar and a vocabulary).},
  author={Hughes, Arthur John},
  year={1986},
  school={Queen's University of Belfast}
}
%%writefile extract-ulster.pl
#!/usr/bin/perl
# Turn "TITLE<TAB>* {{a|DIALECT}} {{IPA|ga|...}}" lines into tab-separated
# pronunciation rows.
#
# Every input line is first echoed back prefixed with "# ". Lines matching
# one of the two recognised shapes are then followed by one or more rows:
#
#   * the "before sé, sí, sibh, siad" / "elsewhere" shape yields two rows,
#     each with the qualifier in the sixth column;
#   * the plain "{{a|DIALECT}} {{IPA|ga|PRON[|PRON...]}}" shape yields one
#     row per pronunciation; when DIALECT is not exactly "Ulster" it is
#     appended after ten tabs.
#
# Lines matching neither shape produce only the "# " echo.

use warnings;
use strict;
use utf8;

# The patterns and output below contain non-ASCII characters, so decode and
# encode as UTF-8 on the standard streams and on any files that <> opens.
use open qw(:std :utf8);

while(<>) {
	chomp;
	print "# $_\n";
	# The two pronoun slots accept both "{{m|ga|sé}}"/"{{m|ga|sí}}" and the
	# empty "{{m|ga|}}" form, so lines that matched before still match.
	if(/^([^\t]+)\t\* ?\{\{a\|[^}]+\}\} \{\{IPA\|ga\|([^|]+)\|qual1=before \{\{m\|ga\|(?:sé)?\}\}, \{\{m\|ga\|(?:sí)?\}\}, \{\{m\|ga\|sibh\}\}, \{\{m\|ga\|siad\}\}\|\/([^\/]+)\/\|qual2=elsewhere\}\}/) {
		# Copy the captures before running another regex operation.
		my ($word, $before, $elsewhere) = ($1, $2, $3);

		# The second pronunciation is captured without its slashes; strip
		# them from the first one too so both rows have the same format.
		$before =~ s/\///g;

		print "$word\t$before\t\t\t\tbefore sé, sí, sibh, siad\n";
		print "$word\t$elsewhere\t\t\t\telsewhere\n";
	} elsif(/^([^\t]+)\t\* \{\{a\|([^}]+)\}\} \{\{IPA\|ga\|([^}]+)\}\}$/) {
		my $word = $1;
		my $dial = $2;
		my $pron = $3;
		$pron =~ s/\///g;

		# Plain Ulster entries carry no dialect column; anything else is
		# pushed out to a far-right column.
		if($dial eq 'Ulster') {
			$dial = "";
		} else {
			$dial = "\t\t\t\t\t\t\t\t\t\t$dial";
		}

		if($pron =~ /\|/) {
			# Several template arguments: one row each.
			# NOTE(review): square brackets are not stripped in this branch,
			# unlike the single-pronunciation branch below - confirm intent.
			for my $pp (split(/\|/, $pron)) {
				print "$word\t$pp" . $dial . "\n";
			}
		} else {
			$pron =~ s/\[//g;
			$pron =~ s/\]//g;
			print "$word\t$pron" . $dial . "\n";
		}
	}
}
%%writefile extract-alt-form.pl
#!/usr/bin/perl
# Scan a MediaWiki XML dump (standard input, or files named on the command
# line) for lines containing an Irish "{{alternative form of|ga|" template.
# Each such line is printed as:
#
#     <page title> TAB <full text of the matching line>
#
# where the page title is the content of the most recently seen
# <title>...</title> element (empty until the first one is encountered).

use warnings;
use strict;
use utf8;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $title = '';
while(<>) {
    chomp;
    # Track which page the following lines belong to.
    if(/<title>([^<]*)<\/title>/) {
        $title = $1;
    }
    if(m!\{\{alternative form of\|ga\|!) {
        print "$title\t$_\n";
    }
}
Writing extract-alt-form.pl
!bzcat enwiktionary-20211101-pages-articles.xml.bz2|perl extract-alt-form.pl > wikt-alts.txt