Part, the first

Setting up MFA

%%capture
import os
os.chdir('/tmp')
!wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
!tar zxvf montreal-forced-aligner_linux.tar.gz
!ln -s /tmp/montreal-forced-aligner/lib/libpython3.6m.so.1.0 /tmp/montreal-forced-aligner/lib/libpython3.6m.so
os.chdir('/kaggle/working')
os.environ['LD_LIBRARY_PATH'] = f'{os.environ["LD_LIBRARY_PATH"]}:/tmp/montreal-forced-aligner/lib/'
os.environ['PATH'] = f'{os.environ["PATH"]}:/tmp/montreal-forced-aligner/bin/'
%%capture
# Install the libgfortran3 system package non-interactively (-y); the
# %%capture magic above suppresses the installer output.
# NOTE(review): presumably a runtime dependency of the MFA binaries - confirm.
!apt-get -y install libgfortran3

To create the same data, fork and run this notebook

!mkdir /tmp/m
!mkdir /tmp/c
!mkdir /tmp/u

!cp ../input/scrape-fuaimeanna-private/wav/*s1.wav /tmp/u
!cp ../input/scrape-fuaimeanna-private/wav/*s2.wav /tmp/m
!cp ../input/scrape-fuaimeanna-private/wav/*s3.wav /tmp/c
%%writefile fuaimeanna-write.pl
#!/usr/bin/perl
use warnings;
use strict;
use utf8;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Connaught-only transcript overrides: when the lower-cased headword matches a
# key, the value is written to the Connaught transcript and lexicon instead.
# NOTE(review): presumably the Connaught recording contains a different
# word/phrase than the headword - confirm against the audio.
my %cr_files = (
	'mo shmidiú' => 'mo chuid smidiú',
	'mo shmior' => 'mo chuid smior',
	'mo shmólach' => 'mo smólach',
	'shmachtaigh' => 'smachtaigh',
	'shmaoinigh' => 'smaoinigh',
	'shmear' => 'smear',
	'deamhain' => 'diabhail',
	'folach' => 'i bhfolach',
	'captaen' => 'caiptín',
	'oirthe' => 'feilte',
);
# Connaught sound paths (fourth input column) for which no Connaught
# transcript or lexicon entry is produced.
# NOTE(review): the name suggests these recordings are empty - confirm.
my %empty = (
	'/sounds/gob_i3_s3.mp3' => 1,
	'/sounds/iioctha_i3_s3.mp3' => 1,
	'/sounds/mo_shuiiochaan_i3_s3.mp3' => 1,
	'/sounds/riail_i3_s3.mp3' => 1
);

# Raw per-dialect lexicons, opened in append mode and written as UTF-8.
# The bareword handle names are kept because the rest of the script passes
# them around as \*LEXM, \*LEXU and \*LEXC. Without a usable lexicon handle the
# whole run is pointless, so a failed open aborts instead of passing silently.
open(LEXM, '>>', '/tmp/lexicon-munster.raw')
	or die "Cannot open /tmp/lexicon-munster.raw for appending: $!\n";
binmode LEXM, ':utf8';
open(LEXU, '>>', '/tmp/lexicon-ulster.raw')
	or die "Cannot open /tmp/lexicon-ulster.raw for appending: $!\n";
binmode LEXU, ':utf8';
open(LEXC, '>>', '/tmp/lexicon-connaught.raw')
	or die "Cannot open /tmp/lexicon-connaught.raw for appending: $!\n";
binmode LEXC, ':utf8';

# Append $text, encoded as UTF-8, to the file at path $file.
#
#   $file - path of the transcript file (created if it does not exist)
#   $text - string to append; no newline is added
#
# The file is opened in append mode, so calling this twice for the same path
# (or running the script twice) concatenates the texts.
# A failure to open or close the file is reported on STDERR and the call
# returns without writing, mirroring how write_pron reports problems; the
# original ignored such failures silently.
sub write_text {
	my $file = shift;
	my $text = shift;
	# Lexical handle instead of the global bareword OUTF.
	open(my $out, '>>:utf8', $file) or do {
		print STDERR "ERROR: cannot open $file for appending: $!\n";
		return;
	};
	print {$out} $text;
	# Buffered write errors only surface at close time.
	close($out) or print STDERR "ERROR: cannot close $file: $!\n";
	return;
}

# Emit "word pronunciation" lexicon lines for $text on the handle $file.
#
#   $file - output filehandle (the callers pass glob refs such as \*LEXU)
#   $text - orthographic text; several words are separated by single spaces
#   $pron - space-separated phone string; words are separated by ' # '
#
# The stress marks (ˈ, ˌ) and the ' . ' separators are stripped from $pron
# before writing. For the phrase 'ar tí' a ' . ˈ ' sequence is first turned
# into a word boundary. When the number of words and of pronunciations differ,
# an ERROR line goes to STDERR and the lines are still written.
sub write_pron {
	my ($file, $text, $pron) = @_;

	$pron =~ s/ \. ˈ / # /g if $text eq 'ar tí';

	# Same three clean-up substitutions, applied in the same order.
	for ($pron) {
		s/ [ˈˌ] / /g;
		s/^[ˈˌ] //g;
		s/ \. / /g;
	}

	my @words = split / /, $text;
	my @prons = split / \# /, $pron;

	print STDERR "ERROR: $file $text $pron\n" unless $#words == $#prons;

	if (@words == 1) {
		# Single word: the whole cleaned string is its pronunciation.
		print {$file} "$text $pron\n";
	}
	else {
		# Pair the n-th word with the n-th pronunciation.
		foreach my $idx (0 .. $#words) {
			print {$file} "$words[$idx] $prons[$idx]\n";
		}
	}
}

# Turn a sound path from the input (e.g. '/sounds/x_i1_s1.mp3') into the
# transcript path inside $dir: drop '/sounds/', swap '.mp3' for '.txt'.
sub _transcript_path {
	my ($dir, $sound) = @_;
	$sound =~ s!/sounds/!!;
	$sound =~ s/\.mp3$/.txt/;
	return $dir . $sound;
}

# Main loop: read tab-separated rows from STDIN and, for each row, write one
# transcript file per dialect plus the matching raw lexicon entries.
# Columns as used below: 0 = orthographic text; 1/3/5 = sound path for
# Ulster/Connaught/Munster; 2/4/6 = pronunciation for the same dialects.
while(<STDIN>) {
	chomp;
	my @line = split/\t/;
	# A blank line yields no fields and produced nothing but warnings.
	next unless @line;
	# Skip the header row.
	next if($line[0] eq 'Orthographic');
	my $text = lc($line[0]);
	next if($line[0] eq "d'fhág");

	my $uout = _transcript_path('/tmp/u/', $line[1]);
	my $cout = _transcript_path('/tmp/c/', $line[3]);
	my $mout = _transcript_path('/tmp/m/', $line[5]);

	my $pronu = $line[2];
	my $pronc = $line[4];
	my $pronm = $line[6];

	# $text is already lower-cased, so it has to be compared with the
	# lower-case form; the former test against 'Gaeilge' could never match
	# and the dialect-specific spellings below were never written.
	if($text eq 'gaeilge') {
		write_text($uout, "gaeilic");
		write_text($cout, "gaeilge");
		write_text($mout, "gaelainn");
		write_pron(\*LEXU, "gaeilic", $pronu);
		write_pron(\*LEXC, "gaeilge", $pronc);
		write_pron(\*LEXM, "gaelainn", $pronm);
		next;
	}
	# Ulster only: for these three words 'x t̪ˠ' is rewritten to 'ɾˠ t̪ˠ'.
	if($line[0] eq 'bocht' || $line[0] eq 'teacht' || $line[0] eq 'teocht') {
		$pronu =~ s/x t̪ˠ/ɾˠ t̪ˠ/;
	}
	write_text($uout, $text);
	write_pron(\*LEXU, $text, $pronu);
	write_text($mout, $text);
	write_pron(\*LEXM, $text, $pronm);
	# Connaught: skip the sound files listed in %empty and apply the
	# per-word transcript overrides from %cr_files.
	if(!exists $empty{$line[3]}) {
		my $cfix = exists $cr_files{$text} ? $cr_files{$text} : $text;
		write_text($cout, $cfix);
		write_pron(\*LEXC, $cfix, $pronc);
	}
}
Writing fuaimeanna-write.pl
!cat ../input/scrape-fuaimeanna-private/all-fuaimeanna-data.tsv | perl fuaimeanna-write.pl
!cat /tmp/lexicon-connaught.raw | sort | uniq > /tmp/lexicon-connaught.txt
!cat /tmp/lexicon-ulster.raw | sort | uniq > /tmp/lexicon-ulster.txt
!cat /tmp/lexicon-munster.raw | sort | uniq > /tmp/lexicon-munster.txt
!cat /tmp/lexicon-connaught.raw /tmp/lexicon-ulster.raw /tmp/lexicon-munster.raw | sort | uniq > /tmp/lexicon-all.txt
!mkdir /tmp/all
!cp /tmp/c/* /tmp/all
!cp /tmp/m/* /tmp/all
!cp /tmp/u/* /tmp/all
!mkdir /tmp/mfa-temp

Run MFA

# Run mfa_train_and_align once per dialect corpus (/tmp/m, /tmp/u, /tmp/c) with
# its own lexicon, then once on the combined corpus /tmp/all with the merged
# lexicon. Every run uses /tmp/mfa-temp as its -t directory.
# NOTE(review): per the MFA 1.0 CLI, -o should be where the trained acoustic
# model is saved and the last argument the TextGrid output directory - confirm.
!mfa_train_and_align -t /tmp/mfa-temp -o ./munster-model /tmp/m /tmp/lexicon-munster.txt /tmp/textgrid-munster
!mfa_train_and_align -t /tmp/mfa-temp -o ./ulster-model /tmp/u /tmp/lexicon-ulster.txt /tmp/textgrid-ulster
!mfa_train_and_align -t /tmp/mfa-temp -o ./connaught-model /tmp/c /tmp/lexicon-connaught.txt /tmp/textgrid-connaught
!mfa_train_and_align -t /tmp/mfa-temp -o ./all-model /tmp/all /tmp/lexicon-all.txt /tmp/textgrid-all
# Run mfa_train_g2p on each of the four lexicons.
# NOTE(review): the last argument is presumably the output path of the trained
# grapheme-to-phoneme model - confirm against the MFA documentation.
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-ulster.txt ./g2p-ulster
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-munster.txt ./g2p-munster
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-connaught.txt ./g2p-connaught
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-all.txt ./g2p-all