Extract Kashubian from (Polish) Wiktionary
Perl in a notebook ftw
!wget https://dumps.wikimedia.org/plwiktionary/20211201/plwiktionary-20211201-pages-articles-multistream.xml.bz2
%%writefile kashubian.pl
#!/usr/bin/perl
# Extracts the example ({{przykłady}}) and collocation ({{kolokacje}}) sections
# of Kashubian entries from a Polish Wiktionary XML dump.
#
# Input:  the dump text on STDIN (or in files named on the command line).
# Output: the raw wikitext lines of the wanted sections, one per line, on STDOUT.
use warnings;
use strict;
use utf8;

# Sample of the kind of line this script emits:
#: (1.1) ''barzo [[kòmùs]] [[dosôdzëc]]'' → [[bardzo]] [[ktoś|komuś]] [[dokuczyć]]

# Feeds one chomped input line to the extractor.
#
#   $state - hash ref holding two flags, updated in place:
#              kashubian => 1 while inside an entry whose header names
#                           {{język kaszubski}}
#              reading   => 1 while inside a wanted section of such an entry
#   $line  - the input line without its trailing newline
#
# Returns 1 when the line is section content that should be printed, else 0.
sub wanted_line {
    my ($state, $line) = @_;

    # A "== word ({{język ...}}) ==" header opens a new language entry. In a
    # MediaWiki XML dump the first wikitext line of a page follows the opening
    # <text ...> tag on the same line, so that prefix is tolerated here.
    # Headers are checked first so that one arriving while a section is being
    # read ends the section instead of being printed as content.
    if ($line =~ /^(?:\s*<text\b[^>]*>)?==/) {
        $state->{kashubian} = ($line =~ /\{\{język kaszubski\}\}/) ? 1 : 0;
        $state->{reading}   = 0;
        return 0;
    }

    # Any line opening with a template closes the current section. The same
    # line may open the next wanted one, so it is evaluated right away, and
    # the language flag is kept: an entry can hold both sections.
    if ($line =~ /^\{\{/) {
        my $wanted = $line =~ /^\{\{(?:kolokacje|przykłady)\}\}/;
        $state->{reading} = ($state->{kashubian} && $wanted) ? 1 : 0;
        return 0;
    }

    return $state->{reading} ? 1 : 0;
}

# Streams the input line by line so the whole dump is never held in memory.
sub main {
    binmode(STDIN, ":utf8");
    binmode(STDOUT, ":utf8");

    my %state = (kashubian => 0, reading => 0);
    while (my $line = <>) {
        chomp $line;
        print "$line\n" if wanted_line(\%state, $line);
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded for tests.
main() unless caller;
!bzcat plwiktionary-20211201-pages-articles-multistream.xml.bz2 | perl kashubian.pl > kashubian.txt