# Minimal example of the utterance XML layout parsed by from_xml() below:
# utterance > sentence > token > word > syllable > phoneme.
# The literal begins and ends with a newline; it is stripped before being
# parsed (presumably because the XML declaration has to come first).
sample = """
<?xml version="1.0" encoding="utf-8"?>
<utterance input_string="">
<sentence input_string="">
<token input_string="SILENCE_TOKEN">
<word input_string="SILENCE_TOKEN" trans_source="src" trans_output_format="final">
<syllable >
<phoneme symbol="sil" end="1.19"/>
</syllable>
</word>
</token>
</sentence>
</utterance>
"""

import xml.etree.ElementTree as ET

class Utterance:
    """Top-level container: an input string plus its sentences."""

    def __init__(self, input, sentences):
        # Attribute order is kept (input first) so ``__dict__``-based
        # serialisation emits keys in the same order.
        self.input, self.sentences = input, sentences

class Sentence:
    """A sentence: an input string plus the tokens it contains."""

    def __init__(self, input, tokens):
        # Attribute order is kept (input first) so ``__dict__``-based
        # serialisation emits keys in the same order.
        self.input, self.tokens = input, tokens

class Token:
    """A token: an input string plus the words it contains."""

    def __init__(self, input, words):
        # Attribute order is kept (input first) so ``__dict__``-based
        # serialisation emits keys in the same order.
        self.input, self.words = input, words

class Word:
    """A word: its input text, transcription source and syllables.

    ``syllables`` may be passed as ``None``, in which case an empty list is
    stored. Each syllable only needs to provide a ``get_phonemes()`` method
    returning a string.
    """

    def __init__(self, input, source, syllables):
        self.input = input
        self.source = source
        self.syllables = [] if syllables is None else syllables

    def get_phonemes(self):
        """Return the phonemes of all syllables as one space-separated string."""
        return " ".join(syllable.get_phonemes() for syllable in self.syllables)

    def get_clean_word(self):
        """Return the input text lower-cased, hyphenating an ``n``/``t`` prefix.

        When the word starts with a lower-case ``n`` or ``t`` immediately
        followed by an upper-case vowel (plain or acute-accented), a hyphen is
        inserted after the prefix and the remainder is lower-cased, e.g.
        ``"nAthair"`` -> ``"n-athair"`` (presumably Irish prefixed forms --
        confirm with the data owner). Any other word is simply lower-cased.
        """
        word = self.input
        # Require two real characters before testing the prefix: slicing a
        # short word yields "", and "" is "in" every string, which previously
        # turned "" into "-" and "n" into "n-".
        if len(word) >= 2 and word[0] in "nt" and word[1] in "AÁEÉIÍOÓUÚ":
            return word[0] + "-" + word[1:].lower()
        return word.lower()

class Syllable:
    """A syllable: an integer stress value plus its phonemes.

    Passing ``phonemes=None`` (the default) stores a fresh empty list.
    """

    def __init__(self, stress: int = 0, phonemes=None):
        self.stress = stress
        self.phonemes = [] if phonemes is None else phonemes

    def get_phonemes(self):
        """Return the phoneme symbols joined by single spaces."""
        symbols = (phoneme.symbol for phoneme in self.phonemes)
        return " ".join(symbols)

class Phoneme:
    """A phoneme symbol together with its ``end`` value.

    NOTE(review): ``end`` is presumably the phoneme's end time; the unit is
    not shown in this file.
    """

    def __init__(self, symbol: str = "", end: float = 0.0):
        # Attribute order is kept (symbol first) so ``__dict__``-based
        # serialisation emits keys in the same order.
        self.symbol, self.end = symbol, end

import io
# Wrap the sample XML in an in-memory text stream so it can be handed to
# from_xml(); strip() drops the leading/trailing newlines of the literal.
sio = io.StringIO(sample.strip())

def from_xml(source):
    """Parse an utterance XML document into an ``Utterance`` object tree.

    The document is walked as utterance (root) > sentence > token > word >
    syllable > phoneme, following direct children only at each level.
    Missing attributes fall back to defaults: ``""`` for ``input_string``,
    ``trans_source`` and ``symbol``; ``0`` for ``stress``; ``0.0`` for ``end``.

    Args:
        source: A filename or file object, passed straight to ``ET.parse``.

    Returns:
        An ``Utterance`` whose sentences, tokens and words each carry the
        ``input_string`` of their own XML element.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
        ValueError: If a ``stress`` or ``end`` attribute is not numeric.
    """
    root = ET.parse(source).getroot()
    sentences = []
    for sentence_el in root.findall('./sentence'):
        tokens = []
        for token_el in sentence_el.findall('./token'):
            words = []
            for word_el in token_el.findall('./word'):
                syllables = []
                for syllable_el in word_el.findall('./syllable'):
                    # A missing stress attribute and the literal string
                    # "None" both mean "no stress", stored as 0.
                    stress_attr = syllable_el.get('stress', 'None')
                    stress = 0 if stress_attr == 'None' else int(stress_attr)
                    phonemes = [
                        Phoneme(
                            phoneme_el.get('symbol', ''),
                            # Default applies to ``end`` itself; the symbol
                            # must not be touched when ``end`` is absent.
                            float(phoneme_el.get('end', 0.0)),
                        )
                        for phoneme_el in syllable_el.findall('./phoneme')
                    ]
                    syllables.append(Syllable(stress, phonemes))
                words.append(Word(
                    word_el.get('input_string', ''),
                    word_el.get('trans_source', ''),
                    syllables,
                ))
            # Each level reads its own element's attribute instead of sharing
            # one variable that inner loops overwrite.
            tokens.append(Token(token_el.get('input_string', ''), words))
        sentences.append(Sentence(sentence_el.get('input_string', ''), tokens))
    return Utterance(root.get('input_string', ''), sentences)

# Parse the in-memory sample into an Utterance object tree.
utt = from_xml(sio)

import json
# Serialise the tree to a JSON string; objects json cannot encode natively are
# replaced by their attribute dictionary (__dict__). The result is neither
# stored nor printed here.
json.dumps(utt, default=lambda o: o.__dict__)

# Output: '{"input": "SILENCE_TOKEN", "sentences": [{"input": "SILENCE_TOKEN", "tokens": [{"input": "SILENCE_TOKEN", "words": [{"input": "SILENCE_TOKEN", "source": "src", "syllables": [{"stress": 0, "phonemes": [{"symbol": "sil", "end": 1.19}]}]}]}]}]}'

# Print one line per word in the utterance: the cleaned (lower-cased) word
# text followed by a space and the word's space-separated phoneme symbols.
for sent in utt.sentences:
    for tok in sent.tokens:
        for word in tok.words:
            print(f'{word.get_clean_word()} {word.get_phonemes()}')

# Output: silence_token sil