Utterance XML to JSON
Dataset
import xml.etree.ElementTree as ET
class Utterance:
    """Top-level container: an input string plus its list of sentences."""

    def __init__(self, input, sentences):
        # Assigned left to right, so ``__dict__`` keeps the order
        # ``input`` then ``sentences``.
        self.input, self.sentences = input, sentences
class Sentence:
    """One sentence: its input string plus the list of tokens it holds."""

    def __init__(self, input, tokens):
        # Assigned left to right, so ``__dict__`` keeps the order
        # ``input`` then ``tokens``.
        self.input, self.tokens = input, tokens
class Token:
    """One token: its input string plus the list of words it holds."""

    def __init__(self, input, words):
        # Assigned left to right, so ``__dict__`` keeps the order
        # ``input`` then ``words``.
        self.input, self.words = input, words
class Word:
    """A single word with its transcription source, POS tag and syllables.

    Attributes:
        input: The word's input string.
        source: The transcription source string for this word.
        pos: The part-of-speech string ("" when not given).
        syllables: List of syllable objects; each must provide
            ``get_phonemes()``. ``None`` is replaced by an empty list.
    """

    def __init__(self, input, source, syllables, pos=""):
        # Attribute order is kept as input, source, pos, syllables so that
        # ``__dict__``-based serialisation produces the same key order.
        self.input = input
        self.source = source
        self.pos = pos
        self.syllables = [] if syllables is None else syllables

    def get_phonemes(self):
        """Return the phonemes of all syllables as one space-separated string."""
        return " ".join([a.get_phonemes() for a in self.syllables])

    def get_clean_word(self):
        """Return the lower-cased word, hyphenating an n/t prefix.

        When the word starts with a lower-case "n" or "t" immediately
        followed by an upper-case vowel, a hyphen is inserted after the
        prefix letter and the remainder is lower-cased
        (e.g. "tAthair" -> "t-athair"). Every other word is simply
        lower-cased.
        """
        word = self.input
        # Require at least two characters: an empty slice is "in" every
        # string, so without this guard "" would become "-" and a bare
        # "n"/"t" would become "n-"/"t-".
        if len(word) >= 2 and word[0] in "nt" and word[1] in "AÁEÉIÍOÓUÚ":
            return word[0] + "-" + word[1:].lower()
        return word.lower()
class Syllable:
    """A syllable: an integer stress value and a list of phoneme objects."""

    def __init__(self, stress: int = 0, phonemes = None):
        self.stress = stress
        # ``None`` stands for "no phonemes"; each instance gets its own list.
        self.phonemes = [] if phonemes is None else phonemes

    def get_phonemes(self):
        """Return the symbols of this syllable's phonemes, space-separated."""
        symbols = (ph.symbol for ph in self.phonemes)
        return " ".join(symbols)
class Phoneme:
    """A phoneme: its symbol string and a float ``end`` value."""

    def __init__(self, symbol: str = "", end: float = 0.0):
        # Assigned left to right, so ``__dict__`` keeps the order
        # ``symbol`` then ``end``.
        self.symbol, self.end = symbol, end
def from_xml(source):
    """Parse an utterance XML document into an ``Utterance`` object tree.

    The document is walked as root -> ``sentence`` -> ``token`` -> ``word``
    -> ``syllable`` -> ``phoneme`` (direct children only at each level).
    Missing attributes fall back to defaults: ``""`` for ``input_string``,
    ``trans_source``, ``pos`` and ``symbol``; ``0`` for ``stress``; ``0.0``
    for ``end``.

    Args:
        source: Anything accepted by ``xml.etree.ElementTree.parse`` (a
            filename or a file object).

    Returns:
        An ``Utterance`` whose sentences, tokens, words, syllables and
        phonemes mirror the XML structure.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
        ValueError: If a ``stress`` attribute is neither ``"None"`` nor an
            integer, or an ``end`` attribute is not a float.
    """
    root = ET.parse(source).getroot()

    sentences = []
    for sentence_el in root.findall('./sentence'):
        tokens = []
        for token_el in sentence_el.findall('./token'):
            words = []
            for word_el in token_el.findall('./word'):
                syllables = []
                for syllable_el in word_el.findall('./syllable'):
                    # A missing attribute and the literal string 'None'
                    # both mean "no stress" and map to 0.
                    stress_attr = syllable_el.get('stress', 'None')
                    stress = 0 if stress_attr == 'None' else int(stress_attr)
                    # Each phoneme gets its own symbol/end defaults, so a
                    # missing ``end`` can neither clobber the symbol nor
                    # reuse the previous phoneme's end time.
                    phonemes = [
                        Phoneme(
                            phoneme_el.get('symbol', ''),
                            float(phoneme_el.get('end', 0.0)),
                        )
                        for phoneme_el in syllable_el.findall('./phoneme')
                    ]
                    syllables.append(Syllable(stress, phonemes))
                words.append(Word(
                    word_el.get('input_string', ''),
                    word_el.get('trans_source', ''),
                    syllables,
                    word_el.get('pos', ''),
                ))
            # Every level reads its own element's ``input_string`` rather
            # than sharing one variable that inner loops overwrite.
            tokens.append(Token(token_el.get('input_string', ''), words))
        sentences.append(
            Sentence(sentence_el.get('input_string', ''), tokens)
        )
    return Utterance(root.get('input_string', ''), sentences)
def get_dictionary(utt):
    """Collect every pronunciation seen for each cleaned word in *utt*.

    Walks ``utt.sentences`` -> ``tokens`` -> ``words`` and returns a dict
    mapping ``word.get_clean_word()`` to the set of ``word.get_phonemes()``
    strings found for that word.
    """
    pronunciations = {}
    for sentence in utt.sentences:
        for token in sentence.tokens:
            for entry in token.words:
                key = entry.get_clean_word()
                # First sighting creates the set; later ones extend it.
                pronunciations.setdefault(key, set()).add(entry.get_phonemes())
    return pronunciations
# Parse one utterance XML file (hard-coded local path) into an Utterance tree.
utt = from_xml("/home/jim/tmp/pmg_ga_co/RCPiarsachALL/xml/MI0001RCPiarsachBairbre_0021.xml")
import json
# Serialise the tree to a JSON string; objects json cannot encode natively
# fall back to their __dict__. The result is not assigned to anything --
# presumably it is only meant to be shown as notebook cell output.
json.dumps(utt, default=lambda o: o.__dict__)
# Build the cleaned-word -> set-of-pronunciations mapping for this utterance
# (result likewise not assigned).
get_dictionary(utt)
# Maps a word to a space-separated string of symbols.
# NOTE(review): the name suggests these are pronunciation overrides for
# specific words -- confirm; nothing in the visible code reads this dict.
co_pron_replacements = {
"thosaigh": "h o s @",
"féin": "h ee nj",
"haghaidh": "h ai"
}
# Maps a file name to a list of (word, word) string pairs.
# NOTE(review): presumably (original, replacement) text corrections to apply
# to that file -- confirm; nothing in the visible code reads this dict.
co_text_word_fixes = {
"RCPiarsachBairbre_0021.xml": [("ar", "ar ar"), ("súl", "súile"), ("máthair", "mothair")],
}
import IPython.display as ipd
# Create an IPython audio display object for a WAV file at a hard-coded
# local path (its file name matches the XML file parsed earlier).
ipd.Audio('/home/jim/tmp/pmg_ga_co/RCPiarsachALL/wav44_trimmed/MI0001RCPiarsachBairbre_0021.wav')