The first few cells set up phonetisaurus; they are adapted from the instructions in the git README.

%%capture
!apt-get -y install git g++ autoconf-archive make libtool
# Python bindings
!apt-get -y install python-setuptools python-dev
# mitlm (to build a quick play model)
!apt-get -y install gfortran

%%capture
!wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.2.tar.gz
!tar -xvzf openfst-1.6.2.tar.gz
%cd openfst-1.6.2
# Minimal configure, compatible with current defaults for Kaldi
!./configure --enable-static --enable-shared --enable-far --enable-ngram-fsts
!make -j 4
# Now wait a while...
!make install

import os
ldlibpath = os.environ['LD_LIBRARY_PATH']
#_STORED_LD = "/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
newld = f'{ldlibpath}:/usr/local/lib:/usr/local/lib/fst'
os.environ['LD_LIBRARY_PATH']=newld
%env LD_LIBRARY_PATH

'/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/lib:/usr/local/lib/fst'

%%capture
%cd /content
!git clone https://github.com/AdolfVonKleist/Phonetisaurus.git
%cd Phonetisaurus
!./configure
!make
!make install

%cd /content/

/content

We also need MITLM

%%capture
!git clone https://github.com/mitlm/mitlm
%cd mitlm
!autoreconf -i
!./configure
!make
!make install

%cd /content

/content

The TIMIT dictionary is relatively clean, so there are only a few small changes that are needed for phonetisaurus.

# Step 1: drop lines that start with ';', delete every '/' character, squeeze
# runs of spaces to a single space, and remove the ~adj, ~v_past, ~v_pres, ~v
# and ~n markers (presumably part-of-speech tags -- confirm against the
# dictionary header). ~v_past and ~v_pres are removed before ~v on purpose:
# the other order would leave "_past" / "_pres" behind.
!cat TIMITDIC.txt|grep -v '^;'|tr -d '/'|sed -e 's/  */ /g;s/~adj//;s/~v_past//;s/~v_pres//;s/~v//;s/~n//;' > TIMIT.cleaned
# Step 2: normalise whitespace, trim both ends, then rewrite each line as
# "<first field><TAB><remaining fields joined by single spaces>".
!cat TIMIT.cleaned | perl -pe 's/\s+/ /g; s/^\s+//; s/\s+$//; @_ = split (/\s+/); $w = shift (@_); $_ = $w."\t".join (" ", @_)."\n";' > TIMIT.clean

!phonetisaurus-align --input=TIMIT.clean --ofile=TIMIT.clean.corpus --seq1_del=false

GitRevision: 0.9.1
Loading input file: TIMIT.clean
Alignment failed: x
Starting EM...
Finished first iter...
Iteration: 1 Change: 2.70318
Iteration: 2 Change: 0.0603504
Iteration: 3 Change: 0.0425539
Iteration: 4 Change: 0.0206814
Iteration: 5 Change: 0.0114756
Iteration: 6 Change: 0.00711536
Iteration: 7 Change: 0.0042429
Iteration: 8 Change: 0.00297546
Iteration: 9 Change: 0.00223923
Iteration: 10 Change: 0.00151825
Iteration: 11 Change: 0.00115204
Last iteration: 
0.001	Loading corpus TIMIT.clean.corpus...
0.037	Smoothing[1] = ModKN
0.037	Smoothing[2] = ModKN
0.037	Smoothing[3] = ModKN
0.037	Smoothing[4] = ModKN
0.037	Smoothing[5] = ModKN
0.037	Smoothing[6] = ModKN
0.037	Smoothing[7] = ModKN
0.037	Smoothing[8] = ModKN
0.037	Set smoothing algorithms...
0.037	Y 6.063492e-01
0.037	Y 6.304450e-01
0.037	Y 7.305669e-01
0.037	Y 7.950124e-01
0.037	Y 8.524463e-01
0.038	Y 9.033717e-01
0.038	Y 9.355036e-01
0.038	Y 9.092702e-01
0.038	Estimating full n-gram model...
0.040	Saving LM to timit.arpa...
GitRevision: 0.9.1
Initializing...
Converting...

That thing I just said about the TIMIT dictionary being relatively clean? Nah. There are some errors, particularly with 'c' being transcribed as 'ao' (which is a vowel sound). Also, the default output of phonetisaurus-align only does 1:1, 1:0, 0:1, 2:1, and 1:2 mappings of graphemes and phonemes, which means some of the alignments look quite strange.

%%writefile clean_ngrams.pl
#!/usr/bin/perl
# Fix some of the alignments from phonetisaurus-align to be more recognisable to humans
# Also fixes some transcription errors in the TIMIT dictionary (mostly c -> ao)
#
# Usage: perl clean_ngrams.pl < aligned.corpus > cleaned.corpus
#
# Every input line is a space-separated list of alignment tokens of the form
# "graphemes}phonemes"; several graphemes or phonemes inside one token are
# joined with '|', and '_' on the phoneme side marks an empty mapping.

use warnings;
use strict;
use utf8;

# Hand-written corrections, one per row: "<tokens to find><TAB><replacement>".
# A row may repair the phonemes or move graphemes between tokens, but it must
# never change the spelling; that invariant is checked when the table is
# loaded. Rows without a TAB are ignored. The heredoc is single-quoted so that
# nothing in the table can ever be interpolated.
# NOTE(review): no English "wh" + "e" word is pronounced with ae, so the two
# "w|h}hh e}w|ae*" keys may themselves be typos for "a}w|ae*" -- confirm
# against the corpus. As written they now keep the grapheme they matched.
my $raw_replacements = <<'_HERE_';
e}_ l}el	e|l}el
e}_ d}ed	e|d}ed
e}_ d}d	e|d}d
e}iy1 e}_	e|e}iy1
i}ix o|n}n	i|o}ix n}n
r}_ t|-}r t}t	r}r t|-|t}t
-|k}n n|a}ae1	-}_ k|n}n a}ae1
a|c}ax c}k	a}ax c|c}k
c}k h}_	c|h}k
c}k q|u}w	c|q}k u}w
n}n|t c}s	n}n c}t|s
i|c}ih1 k|-}k	i}ih1 c|k}k -}_
a|k}ey1 e|-}k	a}ey1 k}k e|-}_
-|k}n n|a}ae2	-}_ k|n}n a}ae2
a|t}ax e}_ -|e}t y}ay1	a}ax t}t e}_ -}_ e|y}ay1
t|u}ch r}axr	t}ch u|r}axr
a}ae1 e}_	a|e}ae1
a}ih e}_	a|e}ih
-|c}ao	-}_ c}k
x}eh1|k -}s	x}eh1|k|s -}_
e}_ l|l}el	e|l|l}el
w|h}hh y}w|ay1	w|h}hh|w y}ay1
a|d}ax j|o}jh u|r}er1	a}ax d|j}jh o|u|r}er1
a|d}ae2 u}jh|uw	a}ae2 d}jh u}uw
u}y|uh a|b}b	u|a}y|uh b}b
x}k -}s	x}k|s -}_
u|r}er1 r}_	u|r|r}er1
o|r}axr r}_	o|r|r}axr
u|r}axr r}_	u|r|r}axr
e|r}axr r}_	e|r|r}axr
a|r}axr r}_ h|o}iy1 e}_	a|r|r|h}axr o|e}iy1
e|r}er r}_	e|r|r}er
i|r}er1 r}_	i|r|r}er1
u}_ a}aa1	u|a}aa1
w|h}hh i}w|er1 r}_	w|h}hh|w i|r}er1
b|o}b r}r	b}b o|r}r
e}_ a|r}er1	e|a|r}er1
q|u}k a}w|ey2	q}k u}w a}ey2
q|u}k a}w|ao1	q}k u}w a}ao1
w|h}hh a}w|ax	w|h}hh|w a}ax
d|u}jh a}uw|ax	d}jh u}uw a}ax
c|i}sh a}iy|ey2	c}sh i}iy a}ey2
i}ix a|t}t	i|a}ix t}t
w|h}hh e}w|iy1 a|t}t	w|h}hh|w e|a}iy1 t}t
q|u}k a}w|aa1	q}k u}w a}aa1
q|u}k a}w|ao2	q}k u}w a}ao2
q|u}k a}w|ae1	q}k u}w a}ae1
w|h}hh a}w|aa1	w|h}hh|w a}aa1
w|h}hh a}w|aa2	w|h}hh|w a}aa2
w|h}hh e}w|ae1	w|h}hh|w e}ae1
w|h}hh e}w|ae2	w|h}hh|w e}ae2
w|h}hh i}w|ay1	w|h}hh|w i}ay1
w|h}hh o}w|aa1	w|h}hh|w o}aa1
y|a}y c|h}aa1 t}t	y}y a}aa1 c|h}_ t}t
i}iy1 e}_	i|e}iy1
m|a}m '}_ a}ae1	m}m a|'|a}ae1
g|u}g e}_	g}g u|e}_
r}r h}_	r|h}r
s}z s|a}ix	s|s}z a}ix
_HERE_

# Return the spelling encoded by a token sequence: the grapheme side of every
# token, with the '|' joiners removed, concatenated in order.
sub graphemes_of {
	my ($tokens) = @_;
	return join '', map {
		my ($grapheme_side) = split /\}/, $_;
		split /\|/, $grapheme_side;
	} split / /, $tokens;
}

my %replacements = ();
for my $row (split /\n/, $raw_replacements) {
	next if $row !~ /\t/;
	my ($from, $to) = split /\t/, $row;
	# A correction that alters the spelling silently corrupts the corpus.
	warn "clean_ngrams: replacement changes the spelling: '$from' -> '$to'\n"
		if graphemes_of($from) ne graphemes_of($to);
	$replacements{$from} = $to;
}

# Longest key first, so the alternation order (and therefore the result) does
# not depend on hash ordering. The lookarounds require the match to start and
# end on a token boundary (a space or either end of the line) without
# consuming the neighbouring space.
my $alternation = join '|',
	map { quotemeta } sort { length($b) <=> length($a) or $a cmp $b } keys %replacements;
my $replacement_re = qr/(?<![^ ])($alternation)(?![^ ])/;

# Apply the table to one line. Each pass rewrites the leftmost whole-token
# match and then rescans from the start, so a replacement can feed a later
# row (e.g. "-|c}ao h}_" becomes "-}_ c}k h}_" and then "-}_ c|h}k").
sub apply_replacements {
	my ($line) = @_;
	1 while $line =~ s/$replacement_re/$replacements{$1}/;
	return $line;
}

# Per-token clean-up: give '-' and "'" a token of their own mapped to '_',
# and repair a lone "c}ao".
sub fix_tokens {
	my ($line) = @_;
	my @phns = split / /, $line;
	my @out = ();
	for my $phn (@phns) {
		if ($phn =~ /^([-'])\|/) {
			# Leading '-' or "'": split it off in front of the rest.
			my $mark = $1;
			push @out, "$mark}_", substr($phn, 2);
		} elsif ($phn =~ /^([^\|])\|([-'])\}(.*)$/) {
			# Trailing '-' or "'" after a single grapheme: the grapheme
			# keeps the phonemes, the mark follows as its own token.
			my ($grapheme, $mark, $phonemes) = ($1, $2, $3);
			push @out, "$grapheme}$phonemes", "$mark}_";
		} elsif ($phn eq 'c}ao') {
			# NOTE(review): this looks at the FIRST token of the line, not
			# the token before the 'c' -- presumably aimed at particular
			# n-initial entries; confirm before generalising.
			push @out, ($phns[0] eq 'n}n') ? 'c}s' : 'c}k';
		} else {
			push @out, $phn;
		}
	}
	return join ' ', @out;
}

# Clean one chomped corpus line and return it (without a newline).
sub clean_line {
	my ($line) = @_;
	return fix_tokens(apply_replacements($line));
}

# Run the filter only when executed as a script, so the subs above can be
# loaded with require (e.g. from a test) without reading STDIN.
unless (caller) {
	while (my $line = <>) {
		chomp $line;
		print clean_line($line), "\n";
	}
}

1;

Writing clean_ngrams.pl

!cat TIMIT.clean.corpus | perl clean_ngrams.pl > TIMIT.cleaner.corpus

!estimate-ngram -o 8 -t TIMIT.cleaner.corpus -wl timit.arpa
# Convert to OpenFst format (10s-20s):
!phonetisaurus-arpa2wfst --lm=timit.arpa --ofile=timit.fst

0.001	Loading corpus TIMIT.cleaner.corpus...
0.026	Smoothing[1] = ModKN
0.026	Smoothing[2] = ModKN
0.026	Smoothing[3] = ModKN
0.026	Smoothing[4] = ModKN
0.026	Smoothing[5] = ModKN
0.026	Smoothing[6] = ModKN
0.026	Smoothing[7] = ModKN
0.026	Smoothing[8] = ModKN
0.026	Set smoothing algorithms...
0.026	Y 6.390977e-01
0.026	Y 6.202592e-01
0.026	Y 7.251729e-01
0.026	Y 7.967686e-01
0.027	Y 8.548704e-01
0.027	Y 9.046288e-01
0.027	Y 9.354281e-01
0.027	Y 9.105453e-01
0.027	Estimating full n-gram model...
0.029	Saving LM to timit.arpa...
GitRevision: 0.9.1
Initializing...
Converting...