%cd /opt
/opt
%%capture
!tar xvf /kaggle/input/extract-prebuilt-kaldi-from-docker/kaldi.tar
%cd kaldi/egs
/opt/kaldi/egs
!git clone https://github.com/danijel3/ClarinStudioKaldi
Cloning into 'ClarinStudioKaldi'...
remote: Enumerating objects: 778, done.
remote: Counting objects: 100% (3/3), done.
remote: Compressing objects: 100% (3/3), done.
remote: Total 778 (delta 0), reused 0 (delta 0), pack-reused 775
Receiving objects: 100% (778/778), 35.26 MiB | 19.96 MiB/s, done.
Resolving deltas: 100% (262/262), done.
%cd ClarinStudioKaldi
/opt/kaldi/egs/ClarinStudioKaldi
%%capture
!conda install -c bioconda perl-perlio-gzip -y
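# PerlIO::gzip is presumably what local_clarin/extract_vocab.pl needs to read
# the gzipped ARPA LM during data prep (an assumption; see clarin_pl_data_prep.sh below).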
import os
os.environ['LD_LIBRARY_PATH'] = '/opt/conda/lib:/opt/kaldi/tools/openfst-1.6.7/lib:/opt/kaldi/src/lib'
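# Optional sanity check (assuming the prebuilt tree keeps the standard
# src/featbin layout): with the LD_LIBRARY_PATH set above, the Kaldi binaries
# should resolve all of their shared libraries.
!ldd /opt/kaldi/src/featbin/compute-mfcc-feats | grep "not found" || echo "all libraries resolve"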
!sed -e 's/~\/apps/\/opt/' path.sh > tmp
!mv tmp path.sh
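# Blank out clarin_pl_clean.sh so the data-cleaning step becomes a no-op, and
# keep data/ on the persistent /kaggle/working volume via a symlink: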
!echo > local_clarin/clarin_pl_clean.sh
!mkdir /kaggle/working/data
!ln -s /kaggle/working/data
%%writefile /opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply
#!/usr/bin/env python
# -*- mode: python; coding: utf-8 -*-

from __future__ import print_function
from __future__ import unicode_literals

import os, sys, logging, subprocess, time, re
from datetime import datetime
from collections import defaultdict
import tempfile

class G2PModelTester () :
    """G2P Model training wrapper class.

    Phonetisaurus G2P modeling training wrapper class.
    This wraps the alignment, joint n-gram training, and ARPA to
    WFST conversion steps into one command.
    """

    def __init__ (self, model, **kwargs) :
        self.model = model
        self.lexicon_file = kwargs.get ("lexicon", None)
        self.nbest = kwargs.get ("nbest", 1)
        self.thresh = kwargs.get ("thresh", 99)
        self.beam = kwargs.get ("beam", 10000)
        self.greedy = kwargs.get ("greedy", False)
        self.accumulate = kwargs.get ("accumulate", False)
        self.pmass = kwargs.get ("pmass", 0.0)
        self.probs = kwargs.get ("probs", False)
        self.verbose = kwargs.get ("verbose", False)
        self.logger = self.setupLogger ()

    def setupLogger (self) :
        """Setup the logger and logging level.

        Setup the logger and logging level.  We only support
        verbose and non-verbose mode.

        Args:
            verbose (bool): Verbose mode, or not.

        Returns:
            Logger: A configured logger instance.
        """

        level = logging.DEBUG if self.verbose else logging.INFO
        logging.basicConfig (
            level=level,
            format="\033[94m%(levelname)s:%(name)s:"\
            "%(asctime)s\033[0m:  %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S"
        )

        return logging.getLogger ("phonetisaurus-apply")

    def _loadLexicon (self) :
        """Load the lexicon from a file.

        Load the reference lexicon from a file, and store it
        in a defaultdict (list).
        """

        _lexicon = defaultdict (list)
        if not self.lexicon_file :
            return _lexicon

        self.logger.debug ("Loading lexicon from file...")
        with open (self.lexicon_file, "r") as ifp :
            for line in ifp :
                # py2py3 compatibility,
                if sys.version_info[0] < 3:
                    line = line.decode("utf8").strip ()
                else:
                    line = line.strip ()
                word, pron = re.split (r"\t", line, 1)
                _lexicon [word].append (pron)

        return _lexicon

    def checkPhonetisaurusConfig (self) :
        """Run some basic checks before training.

        Run some basic checks regarding the $PATH, environment,
        and provided data before starting training.

        Raises:
            EnvironmentError: raised if binaries are not found.
        """

        self.logger.debug ("Checking command configuration...")
        for program in ["phonetisaurus-g2pfst"] :
            if not self.which (program) :
                raise EnvironmentError("Phonetisaurus command, '{0}', "\
                    "not found in path.".format (program))

        if self.lexicon_file and not os.path.exists (self.lexicon_file) :
            self.logger.error ("Could not find provided lexicon file.")
            sys.exit (1)

        for key,val in sorted (vars (self).items ()) :
            self.logger.debug (u"{0}:  {1}".format (key, val))

        self.lexicon = self._loadLexicon ()

        return

    def which (self, program) :
        """Basic 'which' implementation for python.

        Basic 'which' implementation for python from stackoverflow:
          * https://stackoverflow.com/a/377028/6739158

        Args:
            program (str): The program name to search the $PATH for.

        Returns:
            path/None: The path to the executable, or None.
        """

        def is_exe (fpath) :
            return os.path.isfile (fpath) and os.access (fpath, os.X_OK)

        fpath, fname = os.path.split (program)
        if fpath:
            if is_exe (program):
                return program
        else:
            for path in os.environ["PATH"].split (os.pathsep) :
                path = path.strip ('"')
                exe_file = os.path.join (path, program)
                if is_exe (exe_file):
                    return exe_file

        return None

    def makeG2PCommand (self, word_list) :
        """Build the G2P command.

        Build the G2P command from the provided arguments.

        Returns:
            list: The command in subprocess list format.
        """

        command = [
            u"phonetisaurus-g2pfst",
            u"--model={0}".format (self.model),
            u"--nbest={0}".format (self.nbest),
            u"--beam={0}".format (self.beam),
            u"--thresh={0}".format (self.thresh),
            u"--accumulate={0}".format (str (self.accumulate).lower ()),
            u"--pmass={0}".format (self.pmass),
            u"--nlog_probs={0}".format (str(not self.probs).lower ()),
            u"--wordlist={0}".format (word_list)
        ]

        self.logger.debug (u" ".join (command))

        return command

    def runG2PCommand (self, word_list_file) :
        """Generate and run the actual G2P command.

        Generate and run the actual G2P command.  Each synthesized
        entry will be yielded back on-the-fly via the subprocess
        stdout readline method.

        Args:
            word_list_file (str): The input word list.
        """
        g2p_command = self.makeG2PCommand (word_list_file)

        self.logger.debug ("Applying G2P model...")

        with open (os.devnull, "w") as devnull :
            proc = subprocess.Popen (
                g2p_command,
                stdout=subprocess.PIPE,
                stderr=devnull if not self.verbose else None
            )

            for line in proc.stdout :
                parts = re.split (r"\t", line.decode ("utf8").strip ())
                if not len (parts) == 3 :
                    self.logger.warning (
                        u"No pronunciation for word: '{0}'".format (parts [0])
                    )
                    continue

                yield parts

        return

    def applyG2POnly (self, word_list_file) :
        """Apply the G2P model to a word list.

        Apply the G2P model to a word list.  No filtering or application
        of a reference lexicon is used here.

        Args:
            word_list_file (str): The input word list.
        """
        for word, score, pron in self.runG2PCommand (word_list_file) :
            line = u""
            if self.verbose :
                line = u"{0}\t{1:.2f}\t{2}".format (
                    word, float (score), pron
                )
            else :
                line = u"{0}\t{1}".format (word, pron)
            # py2py3 compatibility,
            if sys.version_info[0] < 3:
                print (line.encode ("utf8"))
            else :
                print (line)

        return

    def applyG2PWithLexicon (self, word_list_file) :
        """Apply the G2P model to a word list, combined with lexicon.

        Apply the G2P model to a word list, but combine this with
        a reference lexicon.  Words for which a reference entry exists
        will not be sent to the G2P, unless the additional '--greedy'
        flag is set to True.

        Args:
            word_list_file (str): The input word list.
        """
        target_lexicon = defaultdict (list)
        tmpwordlist = tempfile.NamedTemporaryFile(mode='w', delete=False)

        #First, find any words in the target list for which we already
        # have a canonical pronunciation in the reference lexicon.
        with open (word_list_file, "r") as ifp :
            for word in ifp :
                # py2py3 compatibility,
                if sys.version_info[0] < 3:
                    word = word.decode ("utf8").strip ()
                else:
                    word = word.strip () # already in 'utf8'.
                if word in self.lexicon :
                    target_lexicon [word] = [(0.0,pron)
                                             for pron in self.lexicon [word]]
                    #In greedy mode we still send words to the G2P, even
                    # if we have canonical entries in the reference lexicon.
                    if self.greedy :
                        # py2py3 compatibility,
                        if sys.version_info[0] < 3:
                            print (word.encode ("utf8"), file=tmpwordlist)
                        else:
                            print (word, file=tmpwordlist)
                else :
                    # py2py3 compatibility,
                    if sys.version_info[0] < 3:
                        print (word.encode ("utf8"), file=tmpwordlist)
                    else:
                        print (word, file=tmpwordlist)
        tmpwordlist.close ()

        #Second, iterate through the G2P output, and filter against
        # any possible duplicates previously found in the reference lexicon.
        for word, score, pron in self.runG2PCommand (tmpwordlist.name) :
            prons = set ([p for s,p in target_lexicon [word]])
            if pron in prons :
                continue
            target_lexicon [word].append ((score, pron))

        #Finally, sort everything that is left and print it.
        for word in sorted (target_lexicon.keys ()) :
            for score, pron in target_lexicon [word] :
                line = u""
                if self.verbose :
                    line = u"{0}\t{1:.2f}\t{2}".format (
                        word, float (score), pron
                    )
                else :
                    line = u"{0}\t{1}".format (word, pron)
                # py2py3 compatibility,
                if sys.version_info[0] < 3:
                    print (line.encode ("utf8"))
                else :
                    print (line)

        os.unlink (tmpwordlist.name)
        return

    def ApplyG2PModel (self, word_list_file) :
        """Apply the G2P model to a word list.

        Apply the G2P model to a word list.

        Args:
            word_list_file (str): The input word list.
        """
        self.checkPhonetisaurusConfig ()

        if not os.path.exists (word_list_file) \
           or not os.path.isfile (word_list_file) :
            raise IOError("Word list file not found.")

        if len (self.lexicon) == 0 :
            self.applyG2POnly (word_list_file)
        else :
            self.applyG2PWithLexicon (word_list_file)

        return

if __name__ == "__main__" :
    import argparse

    example = "{0} --model train/model.fst --word_list test.wlist".format (sys.argv [0])

    parser  = argparse.ArgumentParser (description=example)
    parser.add_argument ("--model", "-m", help="Phonetisaurus G2P fst model.",
                         required=True)
    parser.add_argument ("--lexicon", "-l", help="Optional reference lexicon.",
                         required=False)
    parser.add_argument ("--nbest", "-n", help="Maximum number of hypotheses "
                         "to produce.  Overridden if --pmass is set.",
                         default=1, type=int)
    parser.add_argument ("--beam", "-b", help="Search 'beam'.",
                         default=10000, type=int)
    parser.add_argument ("--thresh", "-t", help="Pruning threshold for n-best.",
                         default=99.0, type=float)
    parser.add_argument ("--greedy", "-g", help="Use the G2P even if a "
                         "reference lexicon has been provided.", default=False,
                         action="store_true")
    parser.add_argument ("--accumulate", "-a", help="Accumulate probabilities "
                         "across unique pronunciations.", default=False,
                         action="store_true")
    parser.add_argument ("--pmass", "-p", help="Select the maximum number of "
                         "hypotheses summing to P total mass for a word.",
                         default=0.0, type=float)
    parser.add_argument ("--probs", "-pr", help="Print exp(-val) "
                         "instead of default -log values.", default=False,
                         action="store_true")
    parser.add_argument ("--word_list", "-wl", help="Input word or word list to apply "
                        "G2P model to.", type=str)

    parser.add_argument ("--verbose", "-v", help="Verbose mode.",
                         default=False, action="store_true")
    args = parser.parse_args ()

    tester = G2PModelTester (
        args.model,
        **{key:val for key,val in args.__dict__.items ()
           if not key in ["model","word_list"]}
    )

    tester.ApplyG2PModel (args.word_list)
Overwriting /opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply
!chmod a+x /opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply
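# Quick smoke test of the patched script: argparse provides --help
# automatically, so this should print the usage without any import errors.
!/opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply --help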
%%writefile local_clarin/clarin_prepare_dict.sh
#!/bin/bash

# Copyright 2010-2012 Microsoft Corporation  
#           2012-2014 Johns Hopkins University (Author: Daniel Povey)
#                2015 Guoguo Chen

# Modified 2017 Danijel Korzinek

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Call this script from the recipe root (the directory containing
# local_clarin/). It puts its output in the <dict_dir> given as the
# second argument (e.g. data/local/dict_nosp).

# The parts of the output that will be needed later are:
# lexicon.txt
# extra_questions.txt
# nonsilence_phones.txt
# optional_silence.txt
# silence_phones.txt

echo "$0 $@"  # Print the command line for logging
. utils/parse_options.sh || exit 1;

. ./path.sh

if [ $# -ne 2 ]; then
  echo "Usage: ./local/prepare_lang.sh <word_list> <dict_dir>"
  echo "Creates a folder <dict_dir> with lexicon derived from"
  echo "  word list <word_list>."
  exit 1
fi

word_list=$1
dir=$2

mkdir -p $dir

# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.

# silence phones, one per line.
(echo sil) > $dir/silence_phones.txt
echo sil > $dir/optional_silence.txt

# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone.
printf "I\nS\nZ\na\nb\nd\ndZ\ndz\ndzi\ne\nen\nf\ng\ni\nj\nk\nl\nm\nn\nni\no\non\np\nr\ns\nsi\nt\ntS\nts\ntsi\nu\nv\nw\nx\nz\nzi\n" > $dir/nonsilence_phones.txt

# A few extra questions that will be added to those obtained by automatically clustering
# the "real" phones.  These ask about stress; there's also one for silence.
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
 >> $dir/extra_questions.txt || exit 1;

#Transcribe the wordlist
export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH
export PATH=$PATH:/opt/kaldi/tools/phonetisaurus-g2p/
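# -p 0.8: per word, keep the n-best hypotheses whose probabilities sum to at
# most 0.8 of the total probability mass (see --pmass in phonetisaurus-apply).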
/opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply --model local_clarin/model.fst --lexicon local_clarin/lexicon.txt --word_list $word_list -p 0.8 > $dir/lexicon_raw_nosil.txt || exit 1

sort -u $dir/lexicon_raw_nosil.txt -o $dir/lexicon_raw_nosil.txt

# Add the silences, noises etc.
# the sort | uniq is to remove a duplicated pron.
# lexicon.txt is without the _B, _E, _S, _I markers.
(echo -e '<unk>\tsil' ) | \
 cat - $dir/lexicon_raw_nosil.txt | sort -u > $dir/lexicon.txt || exit 1;

# Cleanup
rm -f $dir/lexiconp.txt
rm -f $dir/lexicon_raw_nosil.txt

echo "Dictionary preparation succeeded"
Overwriting local_clarin/clarin_prepare_dict.sh
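# Optional spot check of the G2P model on a single word ("telefon" is just an
# illustrative example). model.fst.gz is unpacked here the same way
# clarin_pl_data_prep.sh does it below.
!gunzip -c local_clarin/model.fst.gz > local_clarin/model.fst
!echo "telefon" > /tmp/one_word.txt
!PATH=$PATH:/opt/kaldi/tools/phonetisaurus-g2p /opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply --model local_clarin/model.fst --word_list /tmp/one_word.txt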
%%writefile local_clarin/clarin_pl_data_prep.sh
#!/bin/bash

. ./path.sh

#you can change this here, if you want it on a different partition, for example
AUDIO_DL_PATH=audio

if [ ! -d $AUDIO_DL_PATH ] ; then mkdir -p $AUDIO_DL_PATH ; fi
pushd $AUDIO_DL_PATH
if [ ! -f audio.tar.gz ] ; then
	echo "Downloading audio from the Clarin-pl website (~4.6GB)..."
	curl -O http://mowa.clarin-pl.eu/korpusy/audio.tar.gz
else
	echo "File already downloaded! Checking if download is consistent..."
	curl -O http://mowa.clarin-pl.eu/korpusy/audio.md5sum
	if ! md5sum -c audio.md5sum ; then
		echo "Download doesn't match the one on the server! "
		echo "Erase the audio.tar.gz file (and audio folder) and run this script again!"
		exit 1
	fi
fi

if [ ! -d audio ] ; then
	echo "Extracting files..."
	tar xf audio.tar.gz
else
	echo "Files already extracted?"
	echo "Remove the audio dir to extract them again..."
fi
popd

if [ ! -d data ] ; then mkdir data ; fi

echo Generating file lists using proper paths...
python3 local_clarin/generate_lists.py $AUDIO_DL_PATH/audio data local_clarin

echo Generating spk2utt...
utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt
utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
utils/utt2spk_to_spk2utt.pl data/dev/utt2spk > data/dev/spk2utt

echo Preparing dictionary...
if [ ! -d data/local ] ; then mkdir data/local ; fi

cut -f2- -d' ' < data/train/text | tr ' ' '\n' | sort -u > data/local/train.wlist
if [ x"$(which ngram)" != x"" ]
then
	ngram -lm local_clarin/arpa.lm.gz -unk -write-vocab data/local/lm.wlist
else
	perl local_clarin/extract_vocab.pl local_clarin/arpa.lm.gz > data/local/lm.wlist
fi
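# the first four lines of lm.wlist are presumably the LM's special tokens
# (e.g. -pau-, <s>, </s>, <unk>), which tail -n +5 strips before merging
# with the training word list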
tail -n +5 data/local/lm.wlist | cat data/local/train.wlist - | sort -u > data/local/all.wlist
if [ ! -f local_clarin/model.fst ] ; then gunzip -c local_clarin/model.fst.gz > local_clarin/model.fst ; fi
local_clarin/clarin_prepare_dict.sh data/local/all.wlist data/local/dict_nosp || exit 1
Overwriting local_clarin/clarin_pl_data_prep.sh
%%writefile runmfcc.sh
#!/bin/bash

. ./path.sh ## set the paths in this file correctly!

# link to scripts from the standard Kaldi distribution
# we try to use these as much as possible
if [ ! -e conf ] ; then ln -s $KALDI_ROOT/egs/wsj/s5/conf ; fi
if [ ! -e local ] ; then ln -s $KALDI_ROOT/egs/wsj/s5/local ; fi
if [ ! -e utils ] ; then ln -s $KALDI_ROOT/egs/wsj/s5/utils ; fi
if [ ! -e steps ] ; then ln -s $KALDI_ROOT/egs/wsj/s5/steps ; fi

# exits script if error occurs anywhere
# you might not want to do this for interactive shells.
set -e

export nj=40 ## number of concurrent processes
export nj_test=30 ## number of concurrent processes for test; has to be <=30

# This is run as one script below, but the commands can also be
# executed one at a time when debugging.

#run some initial data preparation (look at the file for more details):
local_clarin/clarin_pl_data_prep.sh

#prepare the lang directory
utils/prepare_lang.sh data/local/dict_nosp "<unk>" data/local/tmp_nosp data/lang_nosp

#make G.fst
utils/format_lm.sh data/lang_nosp local_clarin/arpa.lm.gz data/local/dict_nosp/lexicon.txt data/lang_nosp_test

# Make normalized MFCC features.
steps/make_mfcc.sh --nj $nj data/train
steps/compute_cmvn_stats.sh data/train
steps/make_mfcc.sh --nj $nj_test data/test
steps/compute_cmvn_stats.sh data/test
Writing runmfcc.sh
!bash runmfcc.sh
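# Once the run finishes, the generated data directory and features can be
# spot-checked; validate_data_dir.sh and feat-to-dim are standard Kaldi
# utilities, and feat-to-dim should report 13 for the default MFCC config.
!. ./path.sh && utils/validate_data_dir.sh data/train
!. ./path.sh && feat-to-dim scp:data/train/feats.scp -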