Kaldi on Kaggle, ClarinStudio PL Mono iters 30-40
This was my fault; the lexicon wasn't being created properly and it took too long to align
# Notebook setup: unpack a prebuilt Kaldi tree under /opt, fetch the
# ClarinStudioKaldi recipe, and stage the data/exp directories from the
# previous run; the %%writefile cell at the end saves the script that follows
# it as train_mono.sh.
%cd /opt
%%capture
# Extract the Kaldi archive provided as a Kaggle input dataset; the verbose
# tar listing is swallowed by the capture magic above.
!tar xvf /kaggle/input/extract-prebuilt-kaldi-from-docker/kaldi.tar
# Clone the recipe next to the other Kaldi example directories and work there.
%cd kaldi/egs
!git clone https://github.com/danijel3/ClarinStudioKaldi
%cd ClarinStudioKaldi
%%capture
# Install the perl-perlio-gzip package from the bioconda channel.
!conda install -c bioconda perl-perlio-gzip -y
# Point the dynamic loader at the conda, OpenFst and Kaldi library directories.
import os
os.environ['LD_LIBRARY_PATH'] = '/opt/conda/lib:/opt/kaldi/tools/openfst-1.6.7/lib:/opt/kaldi/src/lib'
# Rewrite path.sh so that the first "~/apps" on each line becomes "/opt"
# (written to a temporary file first, then moved over the original).
!cat path.sh|sed -e 's/~\/apps/\/opt/' > tmp
!mv tmp path.sh
# Replace the recipe's clean-up script with a file containing only a newline,
# so running it does nothing.
!echo > local_clarin/clarin_pl_clean.sh
# Link the shared WSJ recipe directories into the current directory.
!ln -s ../wsj/s5/steps
!ln -s ../wsj/s5/conf
!ln -s ../wsj/s5/local
!ln -s ../wsj/s5/utils
# Copy data/ and exp/ from the earlier notebook's output (iterations 1-30,
# going by the dataset name) into the writable working area, then link them
# into the recipe directory.
!cp -r /kaggle/input/kaldi-clarinstudio-polish-train-mono-1-30/data /kaggle/working/
!cp -r /kaggle/input/kaldi-clarinstudio-polish-train-mono-1-30/exp /kaggle/working/
!ln -s /kaggle/working/exp
!ln -s /kaggle/working/data
# Remove all *.log files that came along with the copied exp/ directory.
!find /kaggle/working/exp -name '*.log' -delete
# Print the last field of the "gaussians" line that gmm-info reports for the
# iteration-30 model.
!/opt/kaldi/src/gmmbin/gmm-info --print-args=false exp/mono0/30.mdl | grep gaussians | awk '{print $NF}'
%%writefile train_mono.sh
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# 2019 Xiaohui Zhang
# Apache 2.0
# Trimmed down from WSJ train_mono.sh, to continue from 30
#
# Resumes monophone GMM training in an existing experiment directory.
#
# Usage: train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>
#
# <exp-dir> must already contain 0.mdl, the model of the iteration being
# resumed ($start_iter.mdl), and the per-job fsts.JOB.gz / ali.JOB.gz archives
# of the earlier run.  The script runs iterations $start_iter .. $num_iters-1,
# re-aligning on the iterations listed in $realign_iters, and finally links
# final.mdl / final.occs to the last model written.

# Begin configuration section.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=40 # Number of iterations of training
max_iter_inc=30 # Last iter to increase #Gauss on.
start_iter=30 # Iteration to resume from; <exp-dir>/$start_iter.mdl must exist.
regular_beam=10 # beam used after the first iteration
retry_beam=40
totgauss=1000 # Target #Gaussians.
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
config= # name of config file.
stage=-4
power=0.25
cmvn_opts= # extra options for apply-cmvn (default: reuse <exp-dir>/cmvn_opts if present)
delta_opts= # extra options for add-deltas (default: reuse <exp-dir>/delta_opts if present)
# End configuration section.

echo "$0 $@" # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: steps/train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>"
  echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono"
  echo "main options (for others, see top of script file)"
  echo " --config <config-file> # config containing options"
  echo " --nj <nj> # number of parallel jobs"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --start-iter <n> # iteration to resume from (default 30)"
  exit 1;
fi

data=$1
lang=$2
dir=$3

# Fail early, with a readable message, if the models this run depends on are
# missing; otherwise the first symptom would be a shell arithmetic error or a
# failure buried in a job log.
first_iter=$start_iter
if [ "$stage" -gt "$first_iter" ]; then first_iter=$stage; fi
if [ ! -f "$dir/0.mdl" ]; then
  echo "$0: expected $dir/0.mdl to exist (needed to recover the Gaussian schedule)" >&2
  exit 1
fi
if [ "$first_iter" -lt "$num_iters" ] && [ ! -f "$dir/$first_iter.mdl" ]; then
  echo "$0: expected $dir/$first_iter.mdl to exist (model to resume from)" >&2
  exit 1
fi

# The existing fsts.JOB.gz / ali.JOB.gz archives are indexed by job number, so
# the data must be split into the same number of jobs as in the earlier run.
if [ -f "$dir/num_jobs" ]; then
  prev_nj=$(cat "$dir/num_jobs")
  if [ "$prev_nj" != "$nj" ]; then
    echo "$0: --nj $nj does not match the $prev_nj jobs recorded in $dir/num_jobs;" >&2
    echo "$0: rerun with --nj $prev_nj so the existing alignments and graphs line up." >&2
    exit 1
  fi
fi

mkdir -p "$dir/log"
echo "$nj" > "$dir/num_jobs"

# Unless overridden on the command line, reuse the feature options stored in
# the experiment directory (the stock steps/train_mono.sh records them there)
# so the continued iterations see the same features as the earlier ones.
if [ -z "$cmvn_opts" ] && [ -f "$dir/cmvn_opts" ]; then
  cmvn_opts=$(cat "$dir/cmvn_opts")
fi
if [ -z "$delta_opts" ] && [ -f "$dir/delta_opts" ]; then
  delta_opts=$(cat "$dir/delta_opts")
fi

sdata=$data/split$nj;
[[ -d "$sdata" && "$data/feats.scp" -ot "$sdata" ]] || split_data.sh "$data" "$nj" || exit 1;

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"

cp "$lang/phones.txt" "$dir" || exit 1;

# Recover the Gaussian-count schedule of the earlier run: start from the
# count in 0.mdl and replay the per-iteration increments already applied.
numgauss=$(gmm-info --print-args=false "$dir/0.mdl" | grep gaussians | awk '{print $NF}')
if ! [[ "$numgauss" =~ ^[0-9]+$ ]]; then
  echo "$0: could not read the number of Gaussians from $dir/0.mdl" >&2
  exit 1
fi
incgauss=$(( (totgauss - numgauss) / max_iter_inc )) # per-iter increment for #Gauss

# The training loop below adds $incgauss after every iteration
# x <= $max_iter_inc, so on entering iteration $start_iter the target has
# grown min(start_iter - 1, max_iter_inc) times (29 for the defaults).
done_incs=$(( start_iter - 1 ))
if [ "$done_incs" -gt "$max_iter_inc" ]; then done_incs=$max_iter_inc; fi
if [ "$done_incs" -lt 0 ]; then done_incs=0; fi
numgauss=$(( numgauss + done_incs * incgauss ))

# The narrower first-iteration beam only matters for a fresh run; a resumed
# run uses the regular beam throughout.
beam=$regular_beam

x=$start_iter
while [ "$x" -lt "$num_iters" ]; do
  echo "$0: Pass $x"
  if [ "$stage" -le "$x" ]; then
    if echo "$realign_iters" | grep -qw -- "$x"; then
      echo "$0: Aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence $(cat "$lang/phones/optional_silence.csl") $dir/$x.mdl - |"
      # $cmd and $scale_opts are deliberately unquoted: both may hold several words.
      $cmd JOB=1:$nj "$dir/log/align.$x.JOB.log" \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \
        || exit 1;
    fi
    # Accumulate statistics against the current alignments ...
    $cmd JOB=1:$nj "$dir/log/acc.$x.JOB.log" \
      gmm-acc-stats-ali "$dir/$x.mdl" "$feats" "ark:gunzip -c $dir/ali.JOB.gz|" \
      "$dir/$x.JOB.acc" || exit 1;
    # ... and re-estimate, writing the model for the next iteration.
    $cmd "$dir/log/update.$x.log" \
      gmm-est --write-occs="$dir/$((x + 1)).occs" --mix-up=$numgauss --power=$power "$dir/$x.mdl" \
      "gmm-sum-accs - $dir/$x.*.acc|" "$dir/$((x + 1)).mdl" || exit 1;
    # Best-effort clean-up of this iteration's inputs; some may not exist.
    rm "$dir/$x.mdl" "$dir/$x".*.acc "$dir/$x.occs" 2>/dev/null
  fi
  if [ "$x" -le "$max_iter_inc" ]; then
    numgauss=$(( numgauss + incgauss ))
  fi
  x=$(( x + 1 ))
done

# Point final.mdl / final.occs at the last model written by the loop.
( cd "$dir" || exit 1; rm -f final.mdl final.occs; ln -s "$x.mdl" final.mdl; ln -s "$x.occs" final.occs )

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" "$lang" "$dir"
utils/summarize_warnings.pl "$dir/log"
steps/info/gmm_dir_info.pl "$dir"

echo "$0: Done training monophone system in $dir"
exit 0
# Run the saved script with 40 parallel jobs: data/train is the data directory,
# data/lang_nosp the lang directory and exp/mono0 the experiment directory.
!bash train_mono.sh --nj 40 data/train data/lang_nosp exp/mono0