Training MFA on fuaimeanna.ie
Training the Montreal Forced Aligner using fuaimeanna.ie data on Kaggle
%%capture
import os
os.chdir('/tmp')
!wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
!tar zxvf montreal-forced-aligner_linux.tar.gz
!ln -s /tmp/montreal-forced-aligner/lib/libpython3.6m.so.1.0 /tmp/montreal-forced-aligner/lib/libpython3.6m.so
os.chdir('/kaggle/working')
os.environ['LD_LIBRARY_PATH'] = f'{os.environ["LD_LIBRARY_PATH"]}:/tmp/montreal-forced-aligner/lib/'
os.environ['PATH'] = f'{os.environ["PATH"]}:/tmp/montreal-forced-aligner/bin/'
%%capture
# Install the libgfortran3 system package; output is hidden by %%capture.
# NOTE(review): presumably a runtime dependency of the bundled MFA binaries -- confirm.
!apt-get -y install libgfortran3
To recreate the same data, fork and run the notebook that produces the `scrape-fuaimeanna-private` input used below.
!mkdir /tmp/m
!mkdir /tmp/c
!mkdir /tmp/u
!cp ../input/scrape-fuaimeanna-private/wav/*s1.wav /tmp/u
!cp ../input/scrape-fuaimeanna-private/wav/*s2.wav /tmp/m
!cp ../input/scrape-fuaimeanna-private/wav/*s3.wav /tmp/c
%%writefile fuaimeanna-write.pl
#!/usr/bin/perl
use warnings;
use strict;
use utf8;
# Treat the standard streams as UTF-8: the rows read from STDIN and the
# diagnostics written to STDERR contain non-ASCII text.
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
# Transcript overrides for the Connaught output only: when a row's lowercased
# text is a key here, the mapped value is written to the Connaught transcript
# and lexicon instead of the original text.
# NOTE(review): presumably the Connaught recordings say something different
# from the listed headword for these entries -- confirm against the audio.
my %cr_files = (
'mo shmidiú' => 'mo chuid smidiú',
'mo shmior' => 'mo chuid smior',
'mo shmólach' => 'mo smólach',
'shmachtaigh' => 'smachtaigh',
'shmaoinigh' => 'smaoinigh',
'shmear' => 'smear',
'deamhain' => 'diabhail',
'folach' => 'i bhfolach',
'captaen' => 'caiptín',
'oirthe' => 'feilte',
);
# Connaught sound paths (compared against the fourth TSV column) whose rows
# get no Connaught transcript and no Connaught lexicon entry.
# NOTE(review): the name suggests these recordings are empty -- confirm.
my %empty = (
'/sounds/gob_i3_s3.mp3' => 1,
'/sounds/iioctha_i3_s3.mp3' => 1,
'/sounds/mo_shuiiochaan_i3_s3.mp3' => 1,
'/sounds/riail_i3_s3.mp3' => 1
);
# Raw per-dialect lexicon files, opened in append mode. Each receives one
# "word pronunciation" line per entry; duplicate lines are expected here and
# are removed afterwards by the notebook's sort | uniq step.
# The bareword handle names are kept because the rest of the script passes
# them around as \*LEXM, \*LEXU and \*LEXC.
# Fail immediately if a lexicon cannot be opened: without it every later
# print would be lost.
open(LEXM, '>>', '/tmp/lexicon-munster.raw')
    or die "Cannot open /tmp/lexicon-munster.raw for appending: $!";
binmode LEXM, ':utf8';
open(LEXU, '>>', '/tmp/lexicon-ulster.raw')
    or die "Cannot open /tmp/lexicon-ulster.raw for appending: $!";
binmode LEXU, ':utf8';
open(LEXC, '>>', '/tmp/lexicon-connaught.raw')
    or die "Cannot open /tmp/lexicon-connaught.raw for appending: $!";
binmode LEXC, ':utf8';
# Append $text, UTF-8 encoded and without a trailing newline, to $file.
#
# Arguments:
#   $file - path of the transcript file (created if it does not exist)
#   $text - text to append
#
# Returns true on success. If the file cannot be opened or closed, a warning
# naming the file and the system error is printed and false is returned, so
# that one bad path does not abort the whole run.
sub write_text {
    my ($file, $text) = @_;

    # Lexical handle instead of a shared bareword, and a checked open so a
    # missing directory is reported rather than silently dropping the text.
    my $out;
    if (!open($out, '>>', $file)) {
        warn "Cannot open $file for appending: $!\n";
        return 0;
    }
    binmode $out, ':utf8';
    print {$out} $text;

    # Buffered write errors only surface at close time.
    if (!close($out)) {
        warn "Cannot close $file: $!\n";
        return 0;
    }
    return 1;
}
# Write lexicon entries for $text to an already-open filehandle.
#
# Arguments:
#   $file - output filehandle (glob reference or lexical handle)
#   $text - orthographic text; words are separated by single spaces
#   $pron - space-separated phone string in which " # " separates the
#           pronunciations of the individual words
#
# The stress marks "ˈ" and "ˌ" and the "." separators are stripped, then one
# "word phones" line is printed per word. If the number of words differs from
# the number of " # "-separated pronunciations, the mismatch is reported on
# STDERR and nothing is written, so no entry with a missing, truncated or
# "#"-containing pronunciation reaches the lexicon.
sub write_pron {
    my ($file, $text, $pron) = @_;

    # For 'ar tí' the " . ˈ " sequence is turned into the word separator;
    # apparently its transcription has no " # " between the two words.
    if ($text eq 'ar tí') {
        $pron =~ s/ \. ˈ / # /g;
    }

    # Drop stress marks (mid-string and leading) and "." separators.
    $pron =~ s/ [ˈˌ] / /g;
    $pron =~ s/^[ˈˌ] //g;
    $pron =~ s/ \. / /g;

    my @words = split / /, $text;
    my @prons = split / \# /, $pron;

    if (@words != @prons) {
        print STDERR "ERROR: $file $text $pron\n";
        # Do not emit entries we already know to be wrong.
        return;
    }

    if (@words == 1) {
        print {$file} "$text $pron\n";
        return;
    }

    for my $i (0 .. $#words) {
        print {$file} "$words[$i] $prons[$i]\n";
    }
    return;
}
# Main loop: turn each row of the tab-separated table on STDIN into
# per-dialect transcript files and lexicon entries.
#
# Columns as used below:
#   0     orthographic text
#   1, 2  sound path and pronunciation written to the Ulster outputs
#   3, 4  sound path and pronunciation written to the Connaught outputs
#   5, 6  sound path and pronunciation written to the Munster outputs
while (my $row = <STDIN>) {
    chomp $row;
    my @line = split /\t/, $row;

    # Skip the row whose first column is the literal 'Orthographic'
    # (presumably the header).
    next if $line[0] eq 'Orthographic';
    my $text = lc($line[0]);
    # NOTE(review): this entry is dropped for every dialect; the reason is
    # not visible in this script.
    next if $line[0] eq "d'fhág";

    # Derive the transcript path from each sound path:
    # "/sounds/NAME.mp3" becomes "/tmp/<u|c|m>/NAME.txt".
    my ($uout, $cout, $mout) = @line[1, 3, 5];
    for my $path ($uout, $cout, $mout) {    # aliased, so the edits stick
        $path =~ s!/sounds/!!;
        $path =~ s/\.mp3$/.txt/;
    }
    $uout = '/tmp/u/' . $uout;
    $cout = '/tmp/c/' . $cout;
    $mout = '/tmp/m/' . $mout;

    my ($pronu, $pronc, $pronm) = @line[2, 4, 6];

    # Each dialect gets its own spelling of this word. $text is already
    # lowercased, so it must be compared with the lowercase form; the
    # earlier comparison with 'Gaeilge' could never match, which left this
    # branch unreachable.
    if ($text eq 'gaeilge') {
        write_text($uout, "gaeilic");
        write_text($cout, "gaeilge");
        write_text($mout, "gaelainn");
        write_pron(\*LEXU, "gaeilic", $pronu);
        write_pron(\*LEXC, "gaeilge", $pronc);
        write_pron(\*LEXM, "gaelainn", $pronm);
        next;
    }

    # Ulster-only correction for these three words: replace "x t̪ˠ" in the
    # pronunciation with "ɾˠ t̪ˠ".
    if ($line[0] eq 'bocht' || $line[0] eq 'teacht' || $line[0] eq 'teocht') {
        $pronu =~ s/x t̪ˠ/ɾˠ t̪ˠ/;
    }

    write_text($uout, $text);
    write_pron(\*LEXU, $text, $pronu);
    write_text($mout, $text);
    write_pron(\*LEXM, $text, $pronm);

    # Connaught: skip the sound files listed in %empty, and apply the
    # %cr_files transcript override where one exists.
    if (!exists $empty{$line[3]}) {
        my $cfix = exists $cr_files{$text} ? $cr_files{$text} : $text;
        write_text($cout, $cfix);
        write_pron(\*LEXC, $cfix, $pronc);
    }
}
!cat ../input/scrape-fuaimeanna-private/all-fuaimeanna-data.tsv | perl fuaimeanna-write.pl
!cat /tmp/lexicon-connaught.raw | sort | uniq > /tmp/lexicon-connaught.txt
!cat /tmp/lexicon-ulster.raw | sort | uniq > /tmp/lexicon-ulster.txt
!cat /tmp/lexicon-munster.raw | sort | uniq > /tmp/lexicon-munster.txt
!cat /tmp/lexicon-connaught.raw /tmp/lexicon-ulster.raw /tmp/lexicon-munster.raw | sort | uniq > /tmp/lexicon-all.txt
!mkdir /tmp/all
!cp /tmp/c/* /tmp/all
!cp /tmp/m/* /tmp/all
!cp /tmp/u/* /tmp/all
# Scratch directory handed to every MFA command below via -t.
!mkdir /tmp/mfa-temp
# Train one acoustic model per dialect plus one on the pooled corpus.
# Per the MFA 1.0 command-line usage, the positional arguments are the corpus
# directory, the lexicon and the output directory for the alignments; -o is
# the path where the trained model is saved (here, the working directory).
!mfa_train_and_align -t /tmp/mfa-temp -o ./munster-model /tmp/m /tmp/lexicon-munster.txt /tmp/textgrid-munster
!mfa_train_and_align -t /tmp/mfa-temp -o ./ulster-model /tmp/u /tmp/lexicon-ulster.txt /tmp/textgrid-ulster
!mfa_train_and_align -t /tmp/mfa-temp -o ./connaught-model /tmp/c /tmp/lexicon-connaught.txt /tmp/textgrid-connaught
!mfa_train_and_align -t /tmp/mfa-temp -o ./all-model /tmp/all /tmp/lexicon-all.txt /tmp/textgrid-all
# Train grapheme-to-phoneme models from the same four lexicons; the last
# argument is the output model path.
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-ulster.txt ./g2p-ulster
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-munster.txt ./g2p-munster
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-connaught.txt ./g2p-connaught
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-all.txt ./g2p-all