Living Audio Irish
TTS test corpus for Irish from IDLAK
%%capture
!wget https://ia800700.us.archive.org/6/items/ga.ie.cll.48000.tar/ga.ie.cll.48000.tar.gz
%%capture
!wget https://raw.githubusercontent.com/Idlak/Living-Audio-Dataset/master/ga/text.xml
%%capture
!tar zxvf ga.ie.cll.48000.tar.gz
!rm ga.ie.cll.48000.tar.gz
%%capture
!pip install bs4 lxml
from bs4 import BeautifulSoup
import unicodedata

soup = BeautifulSoup(open('text.xml').read(), 'lxml')
dataset = list()
for entry in soup.find_all('fileid'):
    current = dict()
    current['id'] = entry['id']
    # NFC-normalise so accented characters have a single codepoint form.
    current['text'] = unicodedata.normalize('NFC', entry.text.strip())
    dataset.append(current)
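A quick sanity check that the prompts parsed; the printed values depend on the downloaded corpus:

print(len(dataset))
print(dataset[0])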
!rm text.xml
def is_upper_vowel(letter):
    # Uppercase Irish vowels, including the long (fada) forms.
    return letter in ['A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú']
def irish_lower(word):
    # An n- or t- prefixed to a capitalised vowel (e.g. 'tUachtarán') stays
    # lowercase and gains a hyphen when the rest of the word is lowered.
    if len(word) > 1 and word[0] in ['n', 't'] and is_upper_vowel(word[1]):
        return word[0] + '-' + word[1:].lower()
    else:
        return word.lower()
def irish_lower_sentence(sentence):
    return " ".join([irish_lower(w) for w in sentence.split(" ")])
import re

# Utterance IDs whose apostrophes are stripped entirely below.
hyphens = 'cll_z0001_713 cll_z0001_804 cll_z0002_069 cll_z0002_296 cll_z0002_448 cll_z0002_481 cll_z0002_484 cll_z0002_495'.split(' ')
for entry in dataset:
    tmp = entry['text']
    tmp = re.sub(' - ', ' ', tmp)
    tmp = re.sub(' – ', ' ', tmp)
    tmp = re.sub(r'[‘“”".?!,–—;:]', '', tmp)
    if entry['id'] in hyphens:
        tmp = re.sub("'", '', tmp)
    entry['sentence'] = irish_lower_sentence(tmp)
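To see the cleaning and lowercasing end to end, here is a made-up line (not a corpus utterance):

demo = 'Dúirt sé – "Tá an tUachtarán anseo!"'
demo = re.sub(' – ', ' ', demo)
demo = re.sub(r'[‘“”".?!,–—;:]', '', demo)
print(irish_lower_sentence(demo))  # dúirt sé tá an t-uachtarán anseo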
# All recordings are from a single male Dublin speaker ('cll').
for entry in dataset:
    entry['speaker'] = 'cll'
    entry['accent'] = 'dublin'
    entry['gender'] = 'male'
    entry['path'] = '../input/living-audio-irish-speech-corpus/48000_orig/{}.wav'.format(entry['id'])
import json

with open('living-audio.json', 'w') as jsonf:
    json.dump(dataset, jsonf)
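Optionally, confirm the JSON round-trips cleanly:

with open('living-audio.json') as jsonf:
    assert json.load(jsonf) == dataset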
!wget https://raw.githubusercontent.com/Idlak/idlak/master/idlak-data/ga/ie/lexicon-default.xml
from bs4 import BeautifulSoup
import unicodedata

soup = BeautifulSoup(open('lexicon-default.xml').read(), 'lxml')
lexicon = []
for entry in soup.find_all('lex'):
    current = {}
    current['pron'] = entry['pron']
    current['text'] = unicodedata.normalize('NFC', entry.text.strip())
    lexicon.append(current)
with open('ga-lexicon.json', 'w') as jsonf:
    json.dump(lexicon, jsonf)
!rm lexicon-default.xml
with open('lexicon.txt', 'w') as lextxt:
    for lex in lexicon:
        text = lex['text']
        # Strip the digits (stress markers) from the pronunciation string
        # to leave a plain word/phones lexicon line.
        cleaned = lex['pron'].replace('0', '').replace('1', '').replace('2', '')
        lextxt.write(f'{text} {cleaned}\n')
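A peek at the first few lines of the plain-text lexicon:

with open('lexicon.txt') as lextxt:
    for line in lextxt.readlines()[:5]:
        print(line.rstrip())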