original

PROMPTS = "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/PROMPTS.TXT"

%%capture
!pip install stanza

# Maps each sentence id (the text inside the trailing parentheses of a prompt
# line) to the prompt text that precedes it.
lines = {}

# Explicit encoding so the result does not depend on the platform default.
with open(PROMPTS, encoding="utf-8") as inf:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in inf:
        line = line.strip()
        # Skip blank lines and ";"-prefixed lines.
        if not line or line.startswith(";"):
            continue
        start = line.rfind("(")
        end = line.rfind(")")
        # A usable line carries "(id)" after the text. Without this guard a
        # line lacking the parentheses would make rfind() return -1 and the
        # slices below would store a bogus key/text pair.
        if start == -1 or end < start:
            continue
        sent_id = line[start + 1:end]
        text = line[:start].strip()
        lines[sent_id] = text

import stanza
# Processor chain shared by the model download and the pipeline below, so the
# two cannot drift apart.
_processors = "tokenize,pos,lemma,depparse"

# Fetch the English models for exactly the processors used by the pipeline.
stanza.download("en", processors=_processors, verbose=False)

# Pipeline options, spelled out once and unpacked into the constructor.
# tokenize_pretokenized=False: raw text goes in and Stanza tokenizes it itself.
# NOTE(review): Stanza's documented option for disabling sentence splitting is
# `tokenize_no_ssplit`; confirm whether the un-prefixed `no_ssplit` key below
# has any effect, and whether one-sentence-per-prompt is the intent.
_pipeline_options = {
    "lang": "en",
    "processors": _processors,
    "tokenize_pretokenized": False,
    "no_ssplit": True,
    "verbose": False,
}

nlp = stanza.Pipeline(**_pipeline_options)

# Parsed Stanza result for every prompt, keyed by the same id as `lines`.
trees = {}

# Run the pipeline on each prompt in turn; entries are stored as they are
# produced, in the iteration order of `lines`.
for item, prompt_text in lines.items():
    trees[item] = nlp(prompt_text)

# Write every parsed prompt to "timit.conll": a "# timit_id = <id>" comment
# line, then the document rendered through Stanza's "C" format spec, then a
# blank-line separator.
# The encoding is pinned to UTF-8 so the output file does not depend on the
# platform's default text encoding.
with open("timit.conll", "w", encoding="utf-8") as outf:
    for item, doc in trees.items():
        outf.write(f"# timit_id = {item}\n")
        outf.write("{:C}".format(doc))
        outf.write("\n\n")