Training Kaldi on Kaggle - Data Prep
Training Kaldi on Kaggle needs to be split into steps; this first part covers data preparation: unpacking the prebuilt Kaldi, building the dictionary and word lists, and extracting MFCC features.
%cd /opt
%%capture
!tar xvf /kaggle/input/extract-prebuilt-kaldi-from-docker/kaldi.tar
%cd kaldi/egs
!git clone https://github.com/danijel3/ClarinStudioKaldi
%cd ClarinStudioKaldi
%%capture
!conda install -c bioconda perl-perlio-gzip -y
import os
os.environ['LD_LIBRARY_PATH'] = '/opt/conda/lib:/opt/kaldi/tools/openfst-1.6.7/lib:/opt/kaldi/src/lib'
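Setting LD_LIBRARY_PATH here makes the prebuilt OpenFst and Kaldi shared libraries visible to every command the notebook shells out to below.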
!sed -i 's/~\/apps/\/opt/' path.sh  # point KALDI_ROOT in path.sh at /opt
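The recipe's cleanup script is emptied into a no-op so it cannot delete the prepared data, and the data directory is symlinked into /kaggle/working so it persists as the notebook's output.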
!echo > local_clarin/clarin_pl_clean.sh
!mkdir /kaggle/working/data
!ln -s /kaggle/working/data
%%writefile /opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply
#!/usr/bin/env python
# -*- mode: python; coding: utf-8 -*-
from __future__ import print_function
from __future__ import unicode_literals
import os, sys, logging, subprocess, time, re
from datetime import datetime
from collections import defaultdict
import tempfile
class G2PModelTester () :
    """G2P model application wrapper class.

    Phonetisaurus G2P model application wrapper class.  This wraps
    the G2P decoder invocation, n-best filtering, and optional
    reference-lexicon merging into one command.
    """

    def __init__ (self, model, **kwargs) :
        self.model = model
        self.lexicon_file = kwargs.get ("lexicon", None)
        self.nbest = kwargs.get ("nbest", 1)
        self.thresh = kwargs.get ("thresh", 99)
        self.beam = kwargs.get ("beam", 10000)
        self.greedy = kwargs.get ("greedy", False)
        self.accumulate = kwargs.get ("accumulate", False)
        self.pmass = kwargs.get ("pmass", 0.0)
        self.probs = kwargs.get ("probs", False)
        self.verbose = kwargs.get ("verbose", False)
        self.logger = self.setupLogger ()

    def setupLogger (self) :
        """Setup the logger and logging level.

        Setup the logger and logging level.  We only support
        verbose and non-verbose mode.

        Args:
            verbose (bool): Verbose mode, or not.

        Returns:
            Logger: A configured logger instance.
        """
        level = logging.DEBUG if self.verbose else logging.INFO
        logging.basicConfig (
            level=level,
            format="\033[94m%(levelname)s:%(name)s:"\
                   "%(asctime)s\033[0m: %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S"
        )

        return logging.getLogger ("phonetisaurus-apply")

    def _loadLexicon (self) :
        """Load the lexicon from a file.

        Load the reference lexicon from a file, and store it
        in a defaultdict (list).
        """
        _lexicon = defaultdict (list)
        if not self.lexicon_file :
            return _lexicon

        self.logger.debug ("Loading lexicon from file...")
        with open (self.lexicon_file, "r") as ifp :
            for line in ifp :
                # py2/py3 compatibility,
                if sys.version_info[0] < 3:
                    line = line.decode ("utf8").strip ()
                else:
                    line = line.strip ()
                word, pron = re.split (r"\t", line, 1)
                _lexicon [word].append (pron)

        return _lexicon

    def checkPhonetisaurusConfig (self) :
        """Run some basic checks before applying the model.

        Run some basic checks regarding the $PATH, environment,
        and provided data before applying the G2P model.

        Raises:
            EnvironmentError: raised if binaries are not found.
        """
        self.logger.debug ("Checking command configuration...")
        for program in ["phonetisaurus-g2pfst"] :
            if not self.which (program) :
                raise EnvironmentError("Phonetisaurus command, '{0}', "\
                                       "not found in path.".format (program))

        if self.lexicon_file and not os.path.exists (self.lexicon_file) :
            self.logger.error ("Could not find provided lexicon file.")
            sys.exit (1)

        for key,val in sorted (vars (self).items ()) :
            self.logger.debug (u"{0}: {1}".format (key, val))

        self.lexicon = self._loadLexicon ()

        return

    def which (self, program) :
        """Basic 'which' implementation for python.

        Basic 'which' implementation for python from stackoverflow:
         * https://stackoverflow.com/a/377028/6739158

        Args:
            program (str): The program name to search the $PATH for.

        Returns:
            path/None: The path to the executable, or None.
        """
        def is_exe (fpath) :
            return os.path.isfile (fpath) and os.access (fpath, os.X_OK)

        fpath, fname = os.path.split (program)
        if fpath:
            if is_exe (program):
                return program
        else:
            for path in os.environ["PATH"].split (os.pathsep) :
                path = path.strip ('"')
                exe_file = os.path.join (path, program)
                if is_exe (exe_file):
                    return exe_file

        return None

    def makeG2PCommand (self, word_list) :
        """Build the G2P command.

        Build the G2P command from the provided arguments.

        Returns:
            list: The command in subprocess list format.
        """
        command = [
            u"phonetisaurus-g2pfst",
            u"--model={0}".format (self.model),
            u"--nbest={0}".format (self.nbest),
            u"--beam={0}".format (self.beam),
            u"--thresh={0}".format (self.thresh),
            u"--accumulate={0}".format (str (self.accumulate).lower ()),
            u"--pmass={0}".format (self.pmass),
            u"--nlog_probs={0}".format (str(not self.probs).lower ()),
            u"--wordlist={0}".format (word_list)
        ]

        self.logger.debug (u" ".join (command))

        return command

    def runG2PCommand (self, word_list_file) :
        """Generate and run the actual G2P command.

        Generate and run the actual G2P command.  Each synthesized
        entry will be yielded back on-the-fly via the subprocess
        stdout readline method.

        Args:
            word_list_file (str): The input word list.
        """
        g2p_command = self.makeG2PCommand (word_list_file)

        self.logger.debug ("Applying G2P model...")

        with open (os.devnull, "w") as devnull :
            proc = subprocess.Popen (
                g2p_command,
                stdout=subprocess.PIPE,
                stderr=devnull if not self.verbose else None
            )

            for line in proc.stdout :
                parts = re.split (r"\t", line.decode ("utf8").strip ())
                if not len (parts) == 3 :
                    self.logger.warning (
                        u"No pronunciation for word: '{0}'".format (parts [0])
                    )
                    continue

                yield parts

        return

    def applyG2POnly (self, word_list_file) :
        """Apply the G2P model to a word list.

        Apply the G2P model to a word list.  No filtering or application
        of a reference lexicon is used here.

        Args:
            word_list_file (str): The input word list.
        """
        for word, score, pron in self.runG2PCommand (word_list_file) :
            line = u""
            if self.verbose :
                line = u"{0}\t{1:.2f}\t{2}".format (
                    word, float (score), pron
                )
            else :
                line = u"{0}\t{1}".format (word, pron)
            # py2/py3 compatibility,
            if sys.version_info[0] < 3:
                print (line.encode ("utf8"))
            else :
                print (line)

        return

    def applyG2PWithLexicon (self, word_list_file) :
        """Apply the G2P model to a word list, combined with lexicon.

        Apply the G2P model to a word list, but combine this with
        a reference lexicon.  Words for which a reference entry exists
        will not be sent to the G2P, unless the additional '--greedy'
        flag is set to True.

        Args:
            word_list_file (str): The input word list.
        """
        target_lexicon = defaultdict (list)
        tmpwordlist = tempfile.NamedTemporaryFile (mode='w', delete=False)

        #First, find any words in the target list for which we already
        # have a canonical pronunciation in the reference lexicon.
        with open (word_list_file, "r") as ifp :
            for word in ifp :
                # py2/py3 compatibility,
                if sys.version_info[0] < 3:
                    word = word.decode ("utf8").strip ()
                else:
                    word = word.strip () # already in 'utf8'.
                if word in self.lexicon :
                    target_lexicon [word] = [(0.0,pron)
                        for pron in self.lexicon [word]]
                    #In greedy mode we still send words to the G2P, even
                    # if we have canonical entries in the reference lexicon.
                    if self.greedy :
                        # py2/py3 compatibility,
                        if sys.version_info[0] < 3:
                            print (word.encode ("utf8"), file=tmpwordlist)
                        else:
                            print (word, file=tmpwordlist)
                else :
                    # py2/py3 compatibility,
                    if sys.version_info[0] < 3:
                        print (word.encode ("utf8"), file=tmpwordlist)
                    else:
                        print (word, file=tmpwordlist)
        tmpwordlist.close ()

        #Second, iterate through the G2P output, and filter against
        # any possible duplicates previously found in the reference lexicon.
        for word, score, pron in self.runG2PCommand (tmpwordlist.name) :
            prons = set ([p for s,p in target_lexicon [word]])
            if pron in prons :
                continue
            target_lexicon [word].append ((score, pron))

        #Finally, sort everything that is left and print it.
        for word in sorted (target_lexicon.keys ()) :
            for score, pron in target_lexicon [word] :
                line = u""
                if self.verbose :
                    line = u"{0}\t{1:.2f}\t{2}".format (
                        word, float (score), pron
                    )
                else :
                    line = u"{0}\t{1}".format (word, pron)
                # py2/py3 compatibility,
                if sys.version_info[0] < 3:
                    print (line.encode ("utf8"))
                else :
                    print (line)

        os.unlink (tmpwordlist.name)

        return

    def ApplyG2PModel (self, word_list_file) :
        """Apply the G2P model to a word list.

        Apply the G2P model to a word list.

        Args:
            word_list_file (str): The input word list.
        """
        self.checkPhonetisaurusConfig ()

        if not os.path.exists (word_list_file) \
           or not os.path.isfile (word_list_file) :
            raise IOError("Word list file not found.")

        if len (self.lexicon) == 0 :
            self.applyG2POnly (word_list_file)
        else :
            self.applyG2PWithLexicon (word_list_file)

        return

if __name__ == "__main__" :
    import sys, argparse

    example = "{0} --model train/model.fst --word_list words.list".format (sys.argv [0])
    parser = argparse.ArgumentParser (description=example)
    parser.add_argument ("--model", "-m", help="Phonetisaurus G2P fst model.",
                         required=True)
    parser.add_argument ("--lexicon", "-l", help="Optional reference lexicon.",
                         required=False)
    parser.add_argument ("--nbest", "-n", help="Maximum number of hypotheses "
                         "to produce.  Overridden if --pmass is set.",
                         default=1, type=int)
    parser.add_argument ("--beam", "-b", help="Search 'beam'.",
                         default=10000, type=int)
    parser.add_argument ("--thresh", "-t", help="Pruning threshold for n-best.",
                         default=99.0, type=float)
    parser.add_argument ("--greedy", "-g", help="Use the G2P even if a "
                         "reference lexicon has been provided.", default=False,
                         action="store_true")
    parser.add_argument ("--accumulate", "-a", help="Accumulate probabilities "
                         "across unique pronunciations.", default=False,
                         action="store_true")
    parser.add_argument ("--pmass", "-p", help="Select the maximum number of "
                         "hypotheses summing to P total mass for a word.",
                         default=0.0, type=float)
    parser.add_argument ("--probs", "-pr", help="Print exp(-val) "
                         "instead of default -log values.", default=False,
                         action="store_true")
    parser.add_argument ("--word_list", "-wl", help="Input word or word list to apply "
                         "G2P model to.", type=str)
    parser.add_argument ("--verbose", "-v", help="Verbose mode.",
                         default=False, action="store_true")
    args = parser.parse_args ()

    tester = G2PModelTester (
        args.model,
        **{key:val for key,val in args.__dict__.items ()
           if not key in ["model","word_list"]}
    )

    tester.ApplyG2PModel (args.word_list)
!chmod a+x /opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply
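Optionally, the patched wrapper can be smoke-tested on a one-word list before running the full recipe (a sketch; it assumes local_clarin/model.fst.gz ships with the cloned recipe, which the data-prep script below also relies on):
!gunzip -c local_clarin/model.fst.gz > local_clarin/model.fst
!echo kot > /tmp/test.wlist
!PATH=$PATH:/opt/kaldi/tools/phonetisaurus-g2p /opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply --model local_clarin/model.fst --word_list /tmp/test.wlist --nbest 2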
%%writefile local_clarin/clarin_prepare_dict.sh
#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation
# 2012-2014 Johns Hopkins University (Author: Daniel Povey)
# 2015 Guoguo Chen
# Modified 2017 Danijel Korzinek
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Call this script from the recipe directory. It puts its output in the
# directory given as the second argument (e.g. data/local/dict_nosp).
# The parts of the output of this that will be needed are
# [in data/local/dict/ ]
# lexicon.txt
# extra_questions.txt
# nonsilence_phones.txt
# optional_silence.txt
# silence_phones.txt
echo "$0 $@" # Print the command line for logging
. utils/parse_options.sh || exit 1;
. ./path.sh
if [ $# -ne 2 ]; then
  echo "Usage: $0 <word_list> <dict_dir>"
  echo "Creates a folder <dict_dir> with a lexicon derived from"
  echo " the word list <word_list>."
  exit 1
fi
word_list=$1
dir=$2
mkdir -p $dir
# Make the phone lists (silence and nonsilence phones). The positional
# suffixes _B, _E, _S, _I are added later by utils/prepare_lang.sh.
# silence phones, one per line.
(echo sil) > $dir/silence_phones.txt
echo sil > $dir/optional_silence.txt
# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone.
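# (Here: the Polish phone set of the Clarin corpus, in a SAMPA-like notation.)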
printf "I\nS\nZ\na\nb\nd\ndZ\ndz\ndzi\ne\nen\nf\ng\ni\nj\nk\nl\nm\nn\nni\no\non\np\nr\ns\nsi\nt\ntS\nts\ntsi\nu\nv\nw\nx\nz\nzi\n" > $dir/nonsilence_phones.txt
# A few extra questions that will be added to those obtained by automatically
# clustering the "real" phones: one grouping the silence phones and one
# grouping all nonsilence phones (this phone set carries no stress markers).
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
>> $dir/extra_questions.txt || exit 1;
#Transcribe the wordlist
export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib
export PATH=$PATH:/opt/kaldi/tools/phonetisaurus-g2p/
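# -p 0.8 (--pmass) keeps n-best pronunciations per word until their total
# probability mass reaches 0.8 (see the --pmass help text in the wrapper above).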
/opt/kaldi/tools/phonetisaurus-g2p/src/scripts/phonetisaurus-apply --model local_clarin/model.fst --lexicon local_clarin/lexicon.txt --word_list $word_list -p 0.8 > $dir/lexicon_raw_nosil.txt || exit 1
sort -u $dir/lexicon_raw_nosil.txt -o $dir/lexicon_raw_nosil.txt
# Add the silences, noises etc.
# the sort | uniq is to remove a duplicated pron.
# lexicon.txt is without the _B, _E, _S, _I markers.
(echo -e '<unk>\tsil' ) | \
cat - $dir/lexicon_raw_nosil.txt | sort -u > $dir/lexicon.txt || exit 1;
# Cleanup
rm -f $dir/lexiconp.txt
rm -f $dir/lexicon_raw_nosil.txt
echo "Dictionary preparation succeeded"
%%writefile local_clarin/clarin_pl_data_prep.sh
#!/bin/bash
. ./path.sh
#you can change this here, if you want it on a different partition, for example
AUDIO_DL_PATH=audio
if [ ! -d $AUDIO_DL_PATH ] ; then mkdir -p $AUDIO_DL_PATH ; fi
pushd $AUDIO_DL_PATH
if [ ! -f audio.tar.gz ] ; then
  echo "Downloading audio from the Clarin-pl website (~4.6GB)..."
  curl -O http://mowa.clarin-pl.eu/korpusy/audio.tar.gz
else
  echo "File already downloaded! Checking if download is consistent..."
  curl -O http://mowa.clarin-pl.eu/korpusy/audio.md5sum
  if ! md5sum -c audio.md5sum ; then
    echo "Download doesn't match the one on the server!"
    echo "Erase the audio.tar.gz file (and audio folder) and run this script again!"
    exit 1
  fi
fi
if [ ! -d audio ] ; then
  echo "Extracting files..."
  tar xf audio.tar.gz
else
  echo "Files already extracted?"
  echo "Remove the audio dir to extract them again..."
fi
popd
if [ ! -d data ] ; then mkdir data ; fi
echo Generating file lists using proper paths...
python3 local_clarin/generate_lists.py $AUDIO_DL_PATH/audio data local_clarin
echo Generating spk2utt...
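# utt2spk holds one "<utt-id> <speaker-id>" pair per line; spk2utt inverts it
# to "<speaker-id> <utt-id1> <utt-id2> ...", the form most Kaldi tools expect.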
utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt
utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
utils/utt2spk_to_spk2utt.pl data/dev/utt2spk > data/dev/spk2utt
echo Preparing dictionary...
if [ ! -d data/local ] ; then mkdir data/local ; fi
cut -f2- -d' ' < data/train/text | tr ' ' '\n' | sort -u > data/local/train.wlist
if [ x"$(which ngram)" != x"" ]
then
ngram -lm local_clarin/arpa.lm.gz -unk -write-vocab data/local/lm.wlist
else
perl local_clarin/extract_vocab.pl local_clarin/arpa.lm.gz > data/local/lm.wlist
fi
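# The first four lines of lm.wlist are presumably LM special tokens (e.g. <s>,
# </s>); tail -n +5 skips them before merging with the training word list.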
tail -n +5 data/local/lm.wlist | cat data/local/train.wlist - | sort -u > data/local/all.wlist
if [ ! -f local_clarin/model.fst ] ; then gunzip -c local_clarin/model.fst.gz > local_clarin/model.fst ; fi
local_clarin/clarin_prepare_dict.sh data/local/all.wlist data/local/dict_nosp || exit 1
%%writefile runmfcc.sh
#!/bin/bash
. ./path.sh ## set the paths in this file correctly!
# link to scripts from the standard Kaldi distribution
# we try to use these as much as possible
# test the local symlink, not the (directory) source, so we only link once
if [ ! -e conf ] ; then ln -s $KALDI_ROOT/egs/wsj/s5/conf ; fi
if [ ! -e local ] ; then ln -s $KALDI_ROOT/egs/wsj/s5/local ; fi
if [ ! -e utils ] ; then ln -s $KALDI_ROOT/egs/wsj/s5/utils ; fi
if [ ! -e steps ] ; then ln -s $KALDI_ROOT/egs/wsj/s5/steps ; fi
# exits script if error occurs anywhere
# you might not want to do this for interactive shells.
set -e
export nj=40 ##number of concurrent processes
export nj_test=30 ## number of concurrent processes for test has to be <=30
# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
#run some initial data preparation (look at the file for more details):
local_clarin/clarin_pl_data_prep.sh
#prepare the lang directory
utils/prepare_lang.sh data/local/dict_nosp "<unk>" data/local/tmp_nosp data/lang_nosp
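# "_nosp" marks a dict/lang directory without pronunciation probabilities;
# "<unk>" is the symbol used for out-of-vocabulary words.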
#make G.fst
utils/format_lm.sh data/lang_nosp local_clarin/arpa.lm.gz data/local/dict_nosp/lexicon.txt data/lang_nosp_test
# Make normalized MFCC features.
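# make_mfcc.sh extracts the MFCC features; compute_cmvn_stats.sh computes the
# per-speaker cepstral mean/variance statistics later used to normalize them.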
steps/make_mfcc.sh --nj $nj data/train
steps/compute_cmvn_stats.sh data/train
steps/make_mfcc.sh --nj $nj_test data/test # use the smaller job count for test
steps/compute_cmvn_stats.sh data/test
!bash runmfcc.sh