Irish IPA and alternatives from enwiktionary
Slight modification of earlier notebook
!wget https://dumps.wikimedia.org/enwiktionary/20211101/enwiktionary-20211101-pages-articles.xml.bz2
%%writefile extract-ipa.pl
#!/usr/bin/perl
# Filter a MediaWiki XML dump: for every line containing an Irish
# pronunciation template ({{IPA|ga|...), print the title of the page the
# line belongs to, a tab, and the line itself.
use warnings;
use strict;
use utf8;

# Treat the three standard streams as UTF-8 text.
for my $stream (\*STDIN, \*STDOUT, \*STDERR) {
    binmode($stream, ":utf8");
}

# Title of the page currently being read; empty until the first <title>.
my $current_title = '';

LINE: while (my $line = <>) {
    chomp $line;

    # A <title> element marks the start of a new page.
    $current_title = $1 if $line =~ /<title>([^<]*)<\/title>/;

    # Skip everything that does not carry an Irish IPA template.
    next LINE unless $line =~ m!\{\{IPA\|ga\|!;

    print "$current_title\t$line\n";
}
!bzcat enwiktionary-20211101-pages-articles.xml.bz2|perl extract-ipa.pl > wikt-ipa.txt
@phdthesis{hughes1986gaelic,
title={The {G}aelic of Tangaveane and Commeen, County Donegal (texts, phonology, aspects of grammar and a vocabulary).},
author={Hughes, Arthur John},
year={1986},
school={Queen's University of Belfast}
}
%%writefile extract-ulster.pl
#!/usr/bin/perl
# Convert "<title>\t<wikitext line>" records into tab-separated
# pronunciation rows.
#
# Input is read from STDIN or from files named on the command line
# (presumably the output of extract-ipa.pl -- confirm against the caller).
#
# Output, per input line:
#   * the line itself, echoed with a "# " prefix;
#   * for the "before sé, sí, sibh, siad / elsewhere" template: two rows,
#     "<word>\t<pron>" followed by four tabs and the context note;
#   * for a plain "* {{a|DIALECT}} {{IPA|ga|...}}" line: one row per
#     pronunciation, "<word>\t<pron>", followed by ten tabs and the accent
#     label unless the label is exactly "Ulster".
# Lines matching neither shape produce only the echoed "# " line.
#
# NOTE: the script deliberately works on raw bytes (no "use utf8", no
# binmode). The non-ASCII literals below therefore match UTF-8 input
# byte-for-byte, whether it arrives on STDIN or via a file argument.
use strict;
use warnings;

while (<>) {
    chomp;

    # Echo the source line ahead of whatever rows are derived from it.
    print "# $_\n";

    if(/^([^\t]+)\t\* ?\{\{a\|[^}]+\}\} \{\{IPA\|ga\|([^|]+)\|qual1=before \{\{m\|ga\|sé\}\}, \{\{m\|ga\|sí\}\}, \{\{m\|ga\|sibh\}\}, \{\{m\|ga\|siad\}\}\|\/([^\/]+)\/\|qual2=elsewhere\}\}/) {
        # Copy the captures before running another regex operation.
        my ($word, $before, $elsewhere) = ($1, $2, $3);

        # The first variant is captured together with its /.../ delimiters,
        # the second without them; strip the slashes so both rows agree
        # with each other and with the rows emitted by the branch below.
        $before =~ s/\///g;

        print "$word\t$before\t\t\t\tbefore sé, sí, sibh, siad\n";
        print "$word\t$elsewhere\t\t\t\telsewhere\n";
    } elsif(/^([^\t]+)\t\* \{\{a\|([^}]+)\}\} \{\{IPA\|ga\|([^}]+)\}\}$/) {
        my $word = $1;
        my $dial = $2;
        my $pron = $3;

        # Drop the phonemic /.../ delimiters.
        $pron =~ s/\///g;

        # "Ulster" rows carry no trailing label; any other accent label is
        # appended after ten tabs.
        if($dial eq 'Ulster') {
            $dial = "";
        } else {
            $dial = "\t\t\t\t\t\t\t\t\t\t$dial";
        }

        if($pron =~ /\|/) {
            # Several template arguments: one row per argument.
            for my $pp (split(/\|/, $pron)) {
                print "$word\t$pp" . $dial . "\n";
            }
        } else {
            # Single argument: also drop phonetic [...] brackets.
            $pron =~ s/\[//g;
            $pron =~ s/\]//g;
            print "$word\t$pron" . $dial . "\n";
        }
    }
}
%%writefile extract-alt-form.pl
#!/usr/bin/perl
# Filter a MediaWiki XML dump: for every line containing an Irish
# "alternative form of" template ({{alternative form of|ga|...), print the
# title of the page the line belongs to, a tab, and the line itself.
use warnings;
use strict;
use utf8;

# Treat the three standard streams as UTF-8 text.
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Title of the page currently being read; empty until the first <title>.
my $title = '';

while (<>) {
    chomp;

    # A <title> element marks the start of a new page.
    if (/<title>([^<]*)<\/title>/) {
        $title = $1;
    }

    # Emit only lines that carry the Irish alternative-form template.
    if (m!\{\{alternative form of\|ga\|!) {
        print "$title\t$_\n";
    }
}
!bzcat enwiktionary-20211101-pages-articles.xml.bz2|perl extract-alt-form.pl > wikt-alts.txt