%%capture
!wget https://ia800700.us.archive.org/6/items/ga.ie.cll.48000.tar/ga.ie.cll.48000.tar.gz
%%capture
!wget https://raw.githubusercontent.com/Idlak/Living-Audio-Dataset/master/ga/text.xml
%%capture
!tar zxvf ga.ie.cll.48000.tar.gz
!rm ga.ie.cll.48000.tar.gz
%%capture
!pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup
import unicodedata
# Parse the prompt list; BeautifulSoup's lxml parser lowercases tag names,
# so the <fileid> elements are matched as 'fileid'.
soup = BeautifulSoup(open('text.xml').read(), 'lxml')
dataset = list()
for entry in soup.find_all('fileid'):
    current = dict()
    current['id'] = entry['id']
    # NFC normalisation gives each accented vowel (fada) a single codepoint.
    current['text'] = unicodedata.normalize('NFC', entry.text.strip())
    dataset.append(current)
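# Quick sanity check on the parsed prompts: record count and the first entry.
print(len(dataset))
print(dataset[0])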
!rm text.xml
def is_upper_vowel(letter):
    return letter in ['A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú']

def irish_lower(word):
    # In Irish, a prefixed 'n' or 't' before a capitalised vowel (e.g.
    # 'nÉireann', 'tUasal') takes a hyphen when lowercased: 'n-éireann',
    # 't-uasal'. Plain str.lower() would wrongly merge the prefix into the word.
    if len(word) > 1 and word[0] in ['n', 't'] and is_upper_vowel(word[1]):
        return word[0] + '-' + word[1:].lower()
    else:
        return word.lower()

def irish_lower_sentence(sentence):
    return " ".join([irish_lower(w) for w in sentence.split(" ")])
import re
# IDs of prompts from which straight apostrophes are removed during cleaning.
hyphens = 'cll_z0001_713 cll_z0001_804 cll_z0002_069 cll_z0002_296 cll_z0002_448 cll_z0002_481 cll_z0002_484 cll_z0002_495'.split(' ')

for entry in dataset:
    tmp = entry['text']
    # Drop free-standing hyphens/dashes, then strip remaining punctuation.
    tmp = re.sub(' - ', ' ', tmp)
    tmp = re.sub(' – ', ' ', tmp)
    tmp = re.sub('[‘“”".?!,–—;:]', '', tmp)
    if entry['id'] in hyphens:
        tmp = re.sub("'", '', tmp)
    entry['sentence'] = irish_lower_sentence(tmp)
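# Spot-check one entry: original prompt text vs. cleaned sentence
# (index 0 chosen arbitrarily).
print(dataset[0]['text'])
print(dataset[0]['sentence'])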
# Attach speaker metadata (the corpus has a single speaker, 'cll') and the
# path to each 48 kHz recording.
for entry in dataset:
    entry['speaker'] = 'cll'
    entry['accent'] = 'dublin'
    entry['gender'] = 'male'
    entry['path'] = '../input/living-audio-irish-speech-corpus/48000_orig/{}.wav'.format(entry['id'])
import json

with open('living-audio.json', 'w') as jsonf:
    json.dump(dataset, jsonf)
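# Optional round-trip check: the written JSON reloads to the same records.
with open('living-audio.json') as jsonf:
    assert json.load(jsonf) == dataset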
!wget https://raw.githubusercontent.com/Idlak/idlak/master/idlak-data/ga/ie/lexicon-default.xml
from bs4 import BeautifulSoup
import unicodedata
# Parse the Idlak lexicon: each <lex> element holds the orthographic form as
# its text and the phonemic transcription in its 'pron' attribute.
soup = BeautifulSoup(open('lexicon-default.xml').read(), 'lxml')
lexicon = []
for entry in soup.find_all('lex'):
    current = {}
    current['pron'] = entry['pron']
    current['text'] = unicodedata.normalize('NFC', entry.text.strip())
    lexicon.append(current)
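# Peek at one lexicon entry to confirm the parsed shape.
print(len(lexicon))
print(lexicon[0])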
with open('ga-lexicon.json', 'w') as jsonf:
    json.dump(lexicon, jsonf)
!rm lexicon-default.xml
# Write a plain-text 'word pronunciation' lexicon, stripping the digits 0/1/2
# (stress markers) from the phone string.
with open('lexicon.txt', 'w') as lextxt:
    for lex in lexicon:
        text = lex['text']
        cleaned = lex['pron'].replace('0', '').replace('1', '').replace('2', '')
        lextxt.write(f'{text} {cleaned}\n')
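# Preview the first few lines of the written lexicon file.
!head -n 5 lexicon.txt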