# Minimal utterance document used as parser input further down: one sentence
# holding a single silence token > word > syllable > phoneme chain.
# The literal begins and ends with a newline; the code below strips it before
# handing the text to the XML parser.
sample = """
<?xml version="1.0" encoding="utf-8"?>
<utterance input_string="">
<sentence input_string="">
<token input_string="SILENCE_TOKEN">
<word input_string="SILENCE_TOKEN" trans_source="src" trans_output_format="final">
<syllable >
<phoneme symbol="sil" end="1.19"/>
</syllable>
</word>
</token>
</sentence>
</utterance>
"""

import xml.etree.ElementTree as ET

class Utterance:
  """Top-level container: an input string together with its sentences."""

  def __init__(self, input, sentences):
    # Both arguments are stored as given; ``input`` is assigned first so the
    # attribute order of the instance stays (input, sentences).
    self.input, self.sentences = input, sentences

class Sentence:
  """One sentence: an input string together with its tokens."""

  def __init__(self, input, tokens):
    # Both arguments are stored as given; ``input`` is assigned first so the
    # attribute order of the instance stays (input, tokens).
    self.input, self.tokens = input, tokens

class Token:
  """One token: an input string together with its words."""

  def __init__(self, input, words):
    # Both arguments are stored as given; ``input`` is assigned first so the
    # attribute order of the instance stays (input, words).
    self.input, self.words = input, words

class Word:
  """One word: its input string, transcription source and syllables.

  Args:
    input: The word's input string.
    source: The transcription source label for the word.
    syllables: The word's syllables. May be omitted or ``None``, in which
      case the word gets its own fresh empty list.
  """

  def __init__(self, input, source, syllables=None):
    self.input = input
    self.source = source
    # A new list per instance, so words never share a default container.
    self.syllables = [] if syllables is None else syllables

class Syllable:
  """One syllable: a stress value together with its phonemes."""

  def __init__(self, stress: int = 0, phonemes = None):
    self.stress = stress
    # ``None`` means "no phonemes given": each instance then gets its own
    # fresh list. Any other value, including an empty list, is kept as is.
    self.phonemes = phonemes if phonemes is not None else []

class Phoneme:
  """One phoneme: a symbol together with an ``end`` value."""

  def __init__(self, symbol: str = "", end: float = 0.0):
    # Stored as given; ``symbol`` is assigned first so the attribute order of
    # the instance stays (symbol, end).
    self.symbol, self.end = symbol, end

import io
# Wrap the sample text in a file-like object for the parser. strip() removes
# the newline that opens the triple-quoted literal, so the XML declaration is
# the very first thing in the stream.
sio = io.StringIO(sample.strip())

def _phoneme_from_element(element):
  """Build a Phoneme from a <phoneme> element, defaulting missing attributes."""
  symbol = element.get('symbol', '')
  # A missing ``end`` falls back to 0.0 and leaves ``symbol`` untouched.
  end = float(element.get('end', 0.0))
  return Phoneme(symbol, end)

def _syllable_from_element(element):
  """Build a Syllable from a <syllable> element and its direct <phoneme> children."""
  stress = int(element.get('stress', 0))
  phonemes = [_phoneme_from_element(child) for child in element.findall('./phoneme')]
  return Syllable(stress, phonemes)

def _word_from_element(element):
  """Build a Word from a <word> element and its direct <syllable> children."""
  syllables = [_syllable_from_element(child) for child in element.findall('./syllable')]
  return Word(element.get('input_string', ''), element.get('trans_source', ''), syllables)

def _token_from_element(element):
  """Build a Token from a <token> element and its direct <word> children."""
  words = [_word_from_element(child) for child in element.findall('./word')]
  return Token(element.get('input_string', ''), words)

def _sentence_from_element(element):
  """Build a Sentence from a <sentence> element and its direct <token> children."""
  tokens = [_token_from_element(child) for child in element.findall('./token')]
  return Sentence(element.get('input_string', ''), tokens)

def from_xml(source):
  """Parse an utterance XML document into an ``Utterance`` object tree.

  The layout read is ``root > sentence > token > word > syllable > phoneme``;
  at each level only direct children with the expected tag are visited, and
  the root element's own tag is not checked. Every object takes the
  ``input_string`` of its own element, never that of a descendant.

  Missing attributes fall back to defaults: ``''`` for ``input_string``,
  ``trans_source`` and ``symbol``, ``0`` for ``stress`` and ``0.0`` for
  ``end``.

  Args:
    source: A filename or file object, passed unchanged to ``ET.parse``.

  Returns:
    The ``Utterance`` built from the document's root element.

  Raises:
    xml.etree.ElementTree.ParseError: If the document is not well-formed XML.
    ValueError: If a ``stress`` or ``end`` attribute is not numeric.
  """
  root = ET.parse(source).getroot()
  sentences = [_sentence_from_element(child) for child in root.findall('./sentence')]
  return Utterance(root.get('input_string', ''), sentences)

# Parse the sample document into an Utterance tree.
utt = from_xml(sio)

import json
# Serialise the tree, falling back to each object's __dict__ for the custom
# classes. The result is neither bound nor printed here.
json.dumps(utt, default=lambda o: o.__dict__)

# Presumably the output captured from the json.dumps call above; as a bare
# string expression it has no runtime effect.
# NOTE(review): the sample's <utterance> and <sentence> elements carry
# input_string="", yet this capture records "SILENCE_TOKEN" for both. Confirm
# which is intended and regenerate the capture whenever from_xml changes.
'{"input": "SILENCE_TOKEN", "sentences": [{"input": "SILENCE_TOKEN", "tokens": [{"input": "SILENCE_TOKEN", "words": [{"input": "SILENCE_TOKEN", "source": "src", "syllables": [{"stress": 0, "phonemes": [{"symbol": "sil", "end": 1.19}]}]}]}]}]}'