%cd /opt
/opt
%%capture
!tar xvf /kaggle/input/extract-prebuilt-kaldi-from-docker/kaldi.tar
%cd kaldi/egs
/opt/kaldi/egs
!git clone https://github.com/danijel3/ClarinStudioKaldi
Cloning into 'ClarinStudioKaldi'...
remote: Enumerating objects: 778, done.
remote: Counting objects: 100% (3/3), done.
remote: Compressing objects: 100% (3/3), done.
remote: Total 778 (delta 0), reused 0 (delta 0), pack-reused 775
Receiving objects: 100% (778/778), 35.26 MiB | 22.14 MiB/s, done.
Resolving deltas: 100% (262/262), done.
%cd ClarinStudioKaldi
/opt/kaldi/egs/ClarinStudioKaldi
%%capture
!conda install -c bioconda perl-perlio-gzip -y
import os
os.environ['LD_LIBRARY_PATH'] = '/opt/conda/lib:/opt/kaldi/tools/openfst-1.6.7/lib:/opt/kaldi/src/lib'
!cat path.sh|sed -e 's/~\/apps/\/opt/' > tmp
!mv tmp path.sh
!echo > local_clarin/clarin_pl_clean.sh
!ln -s ../wsj/s5/steps
!ln -s ../wsj/s5/conf
!ln -s ../wsj/s5/local
!ln -s ../wsj/s5/utils
!cp -r /kaggle/input/kaldi-clarinstudio-polish-train-mono-1-30/data /kaggle/working/
!cp -r /kaggle/input/kaldi-clarinstudio-polish-train-mono-1-30/exp /kaggle/working/
!ln -s /kaggle/working/exp
!ln -s /kaggle/working/data
!find /kaggle/working/exp -name '*.log' -delete
!/opt/kaldi/src/gmmbin/gmm-info --print-args=false exp/mono0/30.mdl | grep gaussians | awk '{print $NF}'
925
%%writefile train_mono.sh
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#           2019  Xiaohui Zhang
# Apache 2.0

# Trimmed down from WSJ train_mono.sh, to continue from 30
# Begin configuration section.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=40    # Number of iterations of training
max_iter_inc=30 # Last iter to increase #Gauss on.
regular_beam=10 # beam used after the first iteration
retry_beam=40
totgauss=1000 # Target #Gaussians.
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
config= # name of config file.
stage=-4
power=0.25
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: steps/train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>"
  echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1;
fi

data=$1
lang=$2
dir=$3

oov_sym=`cat $lang/oov.int` || exit 1;

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"

cp $lang/phones.txt $dir || exit 1;

numgauss=`gmm-info --print-args=false $dir/0.mdl | grep gaussians | awk '{print $NF}'`
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
# update from last run
#numgauss=`gmm-info --print-args=false $dir/30.mdl | grep gaussians | awk '{print $NF}'`
#numgauss=925
igauss=1
while [ $igauss -lt 30 ];do
    numgauss=$[$numgauss+$incgauss];
    igauss=$[$igauss+1]
done

# beam is only set to $initial_beam for first run
beam=$regular_beam
x=30
while [ $x -lt $num_iters ]; do
  echo "$0: Pass $x"
  if [ $stage -le $x ]; then
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: Aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \
        || exit 1;
    fi
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" "ark:gunzip -c $dir/ali.JOB.gz|" \
      $dir/$x.JOB.acc || exit 1;

    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
  fi
  if [ $x -le $max_iter_inc ]; then
     numgauss=$[$numgauss+$incgauss];
  fi
  beam=$regular_beam
  x=$[$x+1]
done

( cd $dir; rm final.{mdl,occs} 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs )


steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
utils/summarize_warnings.pl $dir/log

steps/info/gmm_dir_info.pl $dir

echo "$0: Done training monophone system in $dir"

exit 0
Writing train_mono.sh
!bash train_mono.sh --nj 40 data/train data/lang_nosp exp/mono0