Run Stanza on TIMIT prompts
In Kaggle
# Path to the TIMIT PROMPTS.TXT file under the Kaggle input directory.
PROMPTS = "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/PROMPTS.TXT"
%%capture
!pip install stanza
# Parse the prompt file into a {sentence id: sentence text} mapping.
# Each prompt line is expected to look like "<sentence text> (<id>)";
# lines starting with ";" are treated as comments and ignored.
lines = {}
# Explicit encoding so the result does not depend on the platform locale.
with open(PROMPTS, encoding="utf-8") as inf:
    # Iterate the file object directly instead of loading every line
    # into memory first with readlines().
    for line in inf:
        line = line.strip()
        # Skip blank lines and ";" comment lines.
        if not line or line.startswith(";"):
            continue
        end = line.rfind(")")
        start = line.rfind("(")
        # Skip lines without a well-formed trailing "(id)": rfind() returns
        # -1 when a parenthesis is missing, and slicing with that value
        # would silently store a mangled id/text pair.
        if start == -1 or end < start:
            continue
        sent_id = line[start + 1:end]
        text = line[:start].strip()
        lines[sent_id] = text
import stanza
# Processors used for both the model download and the pipeline; defining
# them once keeps the two calls from drifting apart.
STANZA_PROCESSORS = "tokenize,pos,lemma,depparse"

# Fetch the English models needed by the processors above.
stanza.download("en", processors=STANZA_PROCESSORS, verbose=False)

# Build the English pipeline used to parse every prompt.
nlp = stanza.Pipeline(
    lang="en",
    processors=STANZA_PROCESSORS,
    # Let Stanza tokenize the raw prompt text itself ...
    tokenize_pretokenized=False,
    # ... but treat each prompt as a single sentence.
    # NOTE(review): Stanza routes options to processors by their
    # "<processor>_" prefix, so the previous bare `no_ssplit=True` was
    # presumably never seen by the tokenizer; `tokenize_no_ssplit` is the
    # documented option name. Confirm against the installed Stanza version.
    tokenize_no_ssplit=True,
    verbose=False,
)
# Run the pipeline over every prompt, keeping results keyed by sentence id
# in the same order as `lines`.
trees = {prompt_id: nlp(prompt_text) for prompt_id, prompt_text in lines.items()}
# Write every parsed prompt to a single file, one entry per prompt.
# CoNLL-U files are UTF-8 by specification, so the encoding is set
# explicitly instead of relying on the platform default.
with open("timit.conll", "w", encoding="utf-8") as outf:
    for item, doc in trees.items():
        # Comment line so each parse can be traced back to its prompt id.
        outf.write(f"# timit_id = {item}\n")
        # The "C" format spec asks the Stanza document for its CoNLL rendering.
        outf.write(f"{doc:C}")
        # Blank separator between consecutive entries.
        outf.write("\n\n")