wav2vec-u CV-sv - prepare text
Running prepare_text.sh for wav2vec-u on Common Voice Swedish
Original here
%cd /opt
%%capture
!tar xvf /kaggle/input/extract-prebuilt-kaldi-from-docker/kaldi.tar
%cd /tmp
!git clone https://github.com/pytorch/fairseq/
%%capture
!pip install phonemizer
%%capture
!pip install git+https://github.com/pytorch/fairseq/
%%capture
!apt-get -y install espeak
!git clone https://github.com/kpu/kenlm
%%capture
!apt-get -y install libeigen3-dev liblzma-dev zlib1g-dev libbz2-dev
%%capture
%cd kenlm
!mkdir build
%cd build
!cmake ..
!make -j 4
%cd /tmp
import os
os.environ['PATH'] = f"{os.environ['PATH']}:/tmp/kenlm/build/bin/"
os.environ['FAIRSEQ_ROOT'] = '/tmp/fairseq'
!cat /kaggle/input/wav2vec-u-cv-swedish-audio/*.wrd | grep -v '^$' | sort| uniq > /kaggle/working/sentences.txt
%cd fairseq/examples/wav2vec/unsupervised
%%capture
!apt-get -y install zsh
!mkdir /kaggle/working/preppedtext
%cd scripts
The next part requires a fastText language-ID model; I don't know where the 187-language model the script expects comes from, but there is a model for 176 languages here:
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
!cat normalize_and_filter_text.py|sed -e 's/187/176/' > tmp
!mv tmp normalize_and_filter_text.py
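For reference, a minimal sketch of the kind of language-ID filtering the model is used for (this assumes the fasttext Python package is installed; the helper name and threshold are mine, and the real normalize_and_filter_text.py also normalizes the text before filtering):
import fasttext
# lid.176.bin was downloaded into the current directory above
lid_model = fasttext.load_model("lid.176.bin")
def looks_swedish(line, threshold=0.5):
    # predict() expects a single line without a trailing newline
    labels, probs = lid_model.predict(line.strip())
    return labels[0] == "__label__sv" and probs[0] >= threshold
with open("/kaggle/working/sentences.txt") as f:
    kept = [line for line in f if looks_swedish(line)]
print(f"{len(kept)} lines look Swedish")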
import os
os.environ['HYDRA_FULL_ERROR'] = '1'
os.environ['LD_LIBRARY_PATH'] = '/opt/conda/lib:/opt/kaldi/tools/openfst-1.6.7/lib:/opt/kaldi/src/lib'
There are two lines with missing variables in prepare_text.sh (pull request), so replace the file.
While I'm replacing the file anyway: most of the first part of the script is unneeded, since I already have a phonetic dictionary, so I use that instead.
For the calls to the preprocess.py script, make sure to check the threshold: there's a divide-by-zero error if the threshold is set too high.
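A quick way to pick a safe value (my own check, not part of fairseq) is to count how many types would survive a given cutoff before running preprocess.py:
from collections import Counter
counts = Counter()
with open("/kaggle/working/sentences.txt") as f:
    for line in f:
        counts.update(line.split())
for cutoff in (2, 100, 1000):
    survivors = sum(1 for c in counts.values() if c >= cutoff)
    print(f"thresholdsrc={cutoff}: {survivors} types kept")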
Config options for kaldi_initializer.py
in_labels: a naming component for the Kaldi lexicons/FSTs (required)
wav2letter_lexicon: path to the wav2letter lexicon
out_labels: a naming component for the Kaldi lexicons/FSTs; set to in_labels if missing
kaldi_root: path to Kaldi (/opt/kaldi for my Kaggle image)
fst_dir: path where the generated FSTs will be saved
data_dir: path to the phones data
lm_arpa: path to the LM in ARPA format
blank_symbol: CTC blank symbol (<s> here)
silence_symbol: Kaldi symbol for silence (<SIL> is set for two of the scripts)
A config file needs to exist for this, even though the options set in it seem to be ignored.
!mkdir /tmp/fairseq/examples/speech_recognition/kaldi/config/
%%writefile /tmp/fairseq/examples/speech_recognition/kaldi/config/config.yaml
kaldi_root: "/opt/kaldi"
%%writefile prepare_text.sh
#!/usr/bin/env zsh
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
lg=$1
text_path=$2
target_dir=$3
#ph_lg=${lg:l}
#if test "$lg" = 'fr'; then
# ph_lg='fr-fr'
#elif test "$lg" = 'en'; then
# ph_lg='en-us'
#elif test "$lg" = 'pt'; then
# ph_lg='pt-br'
#fi
ph_lg="sv"
echo $lg
echo $ph_lg
echo $text_path
echo $target_dir
mkdir -p $target_dir
#python normalize_and_filter_text.py --lang $lg < $text_path | grep -v '\-\-\-' >! $target_dir/lm.upper.lid.txt
#python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/lm.upper.lid.txt --only-source --destdir $target_dir --thresholdsrc 2 --padding-factor 1 --dict-only
#cut -f1 -d' ' $target_dir/dict.txt | grep -v -x '[[:punct:]]*' | grep -Pv '\d\d\d\d\d+' >! $target_dir/words.txt
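# Reuse the word list and pronunciations from the phonetic dictionary prepared
# with the audio, instead of normalizing and phonemizing from scratch.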
cp /kaggle/input/wav2vec-u-cv-swedish-audio/train.wrd $target_dir/lm.upper.lid.txt
cut -f1 -d' ' /kaggle/input/wav2vec-u-cv-swedish-audio/dict.train >! $target_dir/words.txt
#one=$(echo "1" | PHONEMIZER_ESPEAK_PATH=$(which espeak) phonemize -p ' ' -w '' -l $ph_lg --language-switch remove-flags)
#sed 's/$/ 1/' $target_dir/words.txt | PHONEMIZER_ESPEAK_PATH=$(which espeak) phonemize -o $target_dir/phones.txt -p ' ' -w '' -l $ph_lg -j 70 --language-switch remove-flags
cut -f2- -d' ' /kaggle/input/wav2vec-u-cv-swedish-audio/dict.train >! $target_dir/phones.txt
#echo "one is ${one}"
#sed -i "s/${one}$//" $target_dir/phones.txt
#paste $target_dir/words.txt $target_dir/phones.txt >! $target_dir/lexicon.lst
cp /kaggle/input/wav2vec-u-cv-swedish-audio/dict.train $target_dir/lexicon.lst
#python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones.txt --only-source --destdir $target_dir/phones --thresholdsrc 1000 --padding-factor 1 --dict-only
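# Threshold lowered from 1000 to 2: with this small corpus a high threshold empties
# the phone dictionary (and can trigger the divide-by-zero mentioned above).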
python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones.txt --only-source --destdir $target_dir/phones --thresholdsrc 2 --padding-factor 1 --dict-only
python filter_lexicon.py -d $target_dir/phones/dict.txt < $target_dir/lexicon.lst >! $target_dir/lexicon_filtered.lst
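# Convert the word-level text to phone sequences via the filtered lexicon,
# inserting silence tokens (probability 0.25, surrounding each utterance).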
python phonemize_with_sil.py -s 0.25 --surround --lexicon $target_dir/lexicon_filtered.lst < $target_dir/lm.upper.lid.txt >! $target_dir/phones/lm.phones.filtered.txt
cp $target_dir/phones/dict.txt $target_dir/phones/dict.phn.txt
echo "<SIL> 0" >> $target_dir/phones/dict.phn.txt
python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones/lm.phones.filtered.txt --workers 70 --only-source --destdir $target_dir/phones --srcdict $target_dir/phones/dict.phn.txt
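# Word-level 4-gram KenLM, used by the phone-to-word decoding FSTs below.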
lmplz -o 4 < $target_dir/lm.upper.lid.txt --discount_fallback --prune 0 0 0 3 >! $target_dir/kenlm.wrd.o40003.arpa
build_binary $target_dir/kenlm.wrd.o40003.arpa $target_dir/kenlm.wrd.o40003.bin
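# Build the Kaldi decoding FSTs: phones to words, with and without silence.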
lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py fst_dir=$target_dir/fst/phn_to_words_sil lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones "blank_symbol='<SIL>'" "in_labels='phn'" "kaldi_root='/opt/kaldi'"
lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py fst_dir=$target_dir/fst/phn_to_words lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones "in_labels='phn'" "kaldi_root='/opt/kaldi'"
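# Phone-level 4-gram and 6-gram LMs, plus the phone-to-phone FST.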
lmplz -o 4 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.04.arpa
build_binary -s $target_dir/phones/lm.phones.filtered.04.arpa $target_dir/phones/lm.phones.filtered.04.bin
lmplz -o 6 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.06.arpa
build_binary -s $target_dir/phones/lm.phones.filtered.06.arpa $target_dir/phones/lm.phones.filtered.06.bin
lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py fst_dir=$target_dir/fst/phn_to_phn_sil lm_arpa=$target_dir/phones/lm.phones.filtered.06.arpa data_dir=$target_dir/phones "blank_symbol='<SIL>'" "in_labels='phn'" "kaldi_root='/opt/kaldi'"
add-self-loop-simple.cc attempts to use std::endl with KALDI_LOG, which doesn't work, so rewrite that (I'm not sure if this actually prevents anything from working, but it is really distracting).
%%writefile /tmp/fairseq/examples/speech_recognition/kaldi/add-self-loop-simple.cc
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <iostream>
#include "fstext/fstext-lib.h" // @manual
#include "util/common-utils.h" // @manual
/*
* This program is to modify a FST without self-loop by:
* for each incoming arc with non-eps input symbol, add a self-loop arc
* with that non-eps symbol as input and eps as output.
*
* This is to make sure the resultant FST can do deduplication for repeated
* symbols, which is very common in acoustic model
*
*/
namespace {
int32 AddSelfLoopsSimple(fst::StdVectorFst* fst) {
  typedef fst::MutableArcIterator<fst::StdVectorFst> IterType;

  int32 num_states_before = fst->NumStates();
  fst::MakePrecedingInputSymbolsSame(false, fst);
  int32 num_states_after = fst->NumStates();
  KALDI_LOG << "There are " << num_states_before
            << " states in the original FST; "
            << " after MakePrecedingInputSymbolsSame, there are "
            << num_states_after << " states ";

  auto weight_one = fst::StdArc::Weight::One();
  int32 num_arc_added = 0;

  fst::StdArc self_loop_arc;
  self_loop_arc.weight = weight_one;

  int32 num_states = fst->NumStates();
  std::vector<std::set<int32>> incoming_non_eps_label_per_state(num_states);

  for (int32 state = 0; state < num_states; state++) {
    for (IterType aiter(fst, state); !aiter.Done(); aiter.Next()) {
      fst::StdArc arc(aiter.Value());
      if (arc.ilabel != 0) {
        incoming_non_eps_label_per_state[arc.nextstate].insert(arc.ilabel);
      }
    }
  }

  for (int32 state = 0; state < num_states; state++) {
    if (!incoming_non_eps_label_per_state[state].empty()) {
      auto& ilabel_set = incoming_non_eps_label_per_state[state];
      for (auto it = ilabel_set.begin(); it != ilabel_set.end(); it++) {
        self_loop_arc.ilabel = *it;
        self_loop_arc.olabel = 0;
        self_loop_arc.nextstate = state;
        fst->AddArc(state, self_loop_arc);
        num_arc_added++;
      }
    }
  }
  return num_arc_added;
}

void print_usage() {
  std::cout << "add-self-loop-simple usage:\n"
               "\tadd-self-loop-simple <in-fst> <out-fst> \n";
}
}  // namespace

int main(int argc, char** argv) {
  if (argc != 3) {
    print_usage();
    exit(1);
  }

  auto input = argv[1];
  auto output = argv[2];

  auto fst = fst::ReadFstKaldi(input);
  auto num_states = fst->NumStates();
  KALDI_LOG << "Loading FST from " << input << " with " << num_states
            << " states.";

  int32 num_arc_added = AddSelfLoopsSimple(fst);
  KALDI_LOG << "Adding " << num_arc_added << " self-loop arcs ";

  fst::WriteFstKaldi(*fst, std::string(output));
  KALDI_LOG << "Writing FST to " << output;

  delete fst;
}
!zsh prepare_text.sh sv /kaggle/working/sentences.txt /kaggle/working/preppedtext
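Once the script has run, a quick check (my own; paths follow from prepare_text.sh above) that the main outputs the later wav2vec-u steps will need actually exist:
import os
expected = [
    "/kaggle/working/preppedtext/lexicon_filtered.lst",
    "/kaggle/working/preppedtext/phones/dict.phn.txt",
    "/kaggle/working/preppedtext/phones/lm.phones.filtered.04.bin",
    "/kaggle/working/preppedtext/phones/lm.phones.filtered.06.bin",
    "/kaggle/working/preppedtext/kenlm.wrd.o40003.bin",
    "/kaggle/working/preppedtext/fst/phn_to_words_sil",
]
for path in expected:
    print(("ok      " if os.path.exists(path) else "MISSING ") + path)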