# "Training an Irish MFA model on Kaggle"
> "Train a model for MFA on Irish data on Kaggle"
- toc: false
- branch: master
- hidden: true
- categories: [kaggle, irish, mfa]
Original on [Kaggle](https://www.kaggle.com/jimregan/train-irish-mfa-model)
%%capture
import os
os.chdir('/tmp')
!wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
!tar zxvf montreal-forced-aligner_linux.tar.gz
!ln -s /tmp/montreal-forced-aligner/lib/libpython3.6m.so.1.0 /tmp/montreal-forced-aligner/lib/libpython3.6m.so
os.chdir('/kaggle/working')
os.environ['LD_LIBRARY_PATH'] = f'{os.environ["LD_LIBRARY_PATH"]}:/tmp/montreal-forced-aligner/lib/'
os.environ['PATH'] = f'{os.environ["PATH"]}:/tmp/montreal-forced-aligner/bin/'
%%capture
!yes|apt install libgfortran3
!mkdir /tmp/mfa-temp
import json

# Load the corpus metadata. `data` is iterated below as a sequence of
# utterance records, each read via its 'path', 'id' and 'sentence' keys.
datapath = '../input/living-audio-irish-speech-corpus/living-audio.json'
# The sentences are Irish text, so decode explicitly as UTF-8 rather than
# relying on the locale's default encoding.
with open(datapath, encoding='utf-8') as jsonf:
    data = json.load(jsonf)
!mkdir /tmp/living-audio
lexicon_words = set()
with open('../input/living-audio-irish-speech-corpus/lexicon.txt') as lexicon_file:
for line in lexicon_file.readlines():
words = line.split(' ')
lexicon_words.add(words[0])
import shutil


def _split_hyphens(word):
    """Return *word* with hyphens turned into spaces.

    A leading ``n-`` or ``t-`` prefix is kept attached to the word; only the
    hyphens after that prefix are replaced.
    """
    if word.startswith(('n-', 't-')):
        # str.replace returns a new string; the result must be used.
        return word[:2] + word[2:].replace('-', ' ')
    return word.replace('-', ' ')


# Copy each utterance's audio into the MFA corpus directory and write a
# matching transcript, recording every token the lexicon does not contain.
missing_words = set()
for utt in data:
    shutil.copyfile(utt['path'], f"/tmp/living-audio/{utt['id']}.wav")
    # Parentheses are dropped from the transcript text.
    sentence = utt['sentence'].replace('(', '').replace(')', '')
    words = []
    for word in sentence.split(' '):
        if word not in lexicon_words:
            missing_words.add(word)
            # NOTE(review): hyphen splitting is applied only to tokens that
            # are missing from the lexicon, so hyphenated lexicon entries are
            # left intact -- confirm this matches the intended nesting.
            word = _split_hyphens(word)
        words.append(word)
    # Write transcripts as UTF-8 regardless of the locale default.
    with open(f"/tmp/living-audio/{utt['id']}.txt", 'w', encoding='utf-8') as text:
        text.write(' '.join(words))
# Train an acoustic model on the prepared corpus and align it in one pass.
# Per the MFA 1.0 command line: -t is the temporary working directory and
# -o the path where the trained model is saved; the positional arguments are
# the corpus directory, the pronunciation dictionary, and the directory that
# receives the aligned TextGrids.
!mfa_train_and_align -t /tmp/mfa-temp -o ./irish-model /tmp/living-audio ../input/living-audio-irish-speech-corpus/lexicon.txt ./textgrid