Accept paraphrases
Write a segment from a known paraphrase
# Sample alignment rows fed to accept_paraphrase() below.
# Each non-blank row has eight space-separated fields. The code in this file
# reads field 0 as the file id, fields 2 and 3 as start and duration, fields 4
# and 6 as the two word columns, and the last field as the edit tag
# (cor / ins / del in this sample). "<eps>" marks an empty word slot.
# NOTE(review): the layout resembles Kaldi's ctm-edits format — confirm.
LINES = """
2442207080018759021 1 112.18 0.059 Det 1.0 Det cor
2442207080018759021 1 112.32 0.36 återstår 1.0 återstår cor
2442207080018759021 1 112.76 0.339 nämligen 1.0 nämligen cor
2442207080018759021 1 113.24 0.219 två 1.0 två cor
2442207080018759021 1 114.4 0.399 viktiga 1.0 <eps> ins
2442207080018759021 1 115.86 0.42 brister 1.0 brister cor
2442207080018759021 1 116.28 0.0 <eps> 1.0 som del
2442207080018759021 1 116.28 0.0 <eps> 1.0 är del
2442207080018759021 1 116.28 0.0 <eps> 1.0 viktiga del
2442207080018759021 1 116.62 0.099 att 1.0 att cor
2442207080018759021 1 116.82 0.48 åtgärda. 1.0 åtgärda. cor
"""
def accept_paraphrase(lines):
    """Rewrite alignment rows so every kept row is tagged ``cor``.

    ``lines`` is one string of newline-separated rows; each row is stripped
    and blank rows are ignored. A row is split on single spaces and is
    expected to have at least eight fields, of which this function uses
    field 4 (first word column), field 6 (second word column) and the last
    field (edit tag).

    Handling per row:

    * tag ``cor``: the row is kept verbatim; field 6 is added to both texts.
    * tag ``sub``: field 4 goes to the first text and field 6 to the second;
      the row is kept with field 6 overwritten by field 4 and tag ``cor``.
    * field 6 is ``<eps>``: field 4 goes to the first text only; the row is
      kept with field 6 overwritten by field 4 and tag ``cor``.
    * field 4 is ``<eps>``: field 6 goes to the second text only; the row is
      dropped from the output.
    * anything else, including rows with fewer than eight fields, is
      reported with ``print("Huh?", row)`` and skipped.

    Returns:
        ``(outlines, (left_text, right_text))`` where ``outlines`` is the
        list of kept rows and the two texts are the collected words joined
        with single spaces.
    """
    outlines = []
    ltext = []
    rtext = []
    for raw in lines.split("\n"):
        line = raw.strip()
        if not line:
            continue
        parts = line.split(" ")
        if len(parts) < 8:
            # Too few fields to index safely: report the row like any other
            # unrecognised one instead of failing with IndexError below.
            print("Huh?", line)
            continue
        tag = parts[-1]
        if tag == "cor":
            outlines.append(line)
            # Both texts take the second word column on a match.
            ltext.append(parts[6])
            rtext.append(parts[6])
        elif tag == "sub":
            ltext.append(parts[4])
            rtext.append(parts[6])
            parts[6] = parts[4]
            parts[7] = "cor"
            outlines.append(" ".join(parts))
        elif parts[6] == "<eps>":
            # Word present only in the first column: accept it as correct.
            ltext.append(parts[4])
            parts[6] = parts[4]
            parts[7] = "cor"
            outlines.append(" ".join(parts))
        elif parts[4] == "<eps>":
            # Word present only in the second column: it has no row of its
            # own in the output, but still belongs to the second text.
            rtext.append(parts[6])
        else:
            print("Huh?", line)
    return outlines, (" ".join(ltext), " ".join(rtext))
def generate_filename(lines):
    """Build the ``.ctmedit`` file name for a list of alignment rows.

    The name is ``<file id>_<start>_<duration>.ctmedit``. The file id and
    the start (kept exactly as written) come from fields 0 and 2 of the
    first row. The duration is the last row's start plus its duration
    (fields 2 and 3) minus the first row's start, formatted with two
    decimals. Note that the third component is the segment duration, not
    its end time.
    """
    head_fields = lines[0].split(" ")
    tail_fields = lines[-1].split(" ")
    segment_start = head_fields[2]
    segment_end = float(tail_fields[2]) + float(tail_fields[3])
    duration = segment_end - float(segment_start)
    return f"{head_fields[0]}_{segment_start}_{duration:.2f}.ctmedit"
from pathlib import Path
def write_ctm_segment(outdir, lines):
    """Write alignment rows to ``outdir`` under their generated file name.

    The file name comes from ``generate_filename(lines)``; it is computed
    before anything touches the filesystem, so bad input fails without
    creating the directory. ``outdir`` is created (including missing
    parents) when it does not exist. Each row is written followed by a
    newline, encoded as UTF-8 so non-ASCII words are stored the same way on
    every platform.
    """
    filename = generate_filename(lines)
    dirpath = Path(outdir)
    # exist_ok avoids the check-then-create race; parents handles nested paths.
    dirpath.mkdir(parents=True, exist_ok=True)
    with open(dirpath / filename, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)
def write_text(outdir, filename, text):
    """Write ``text`` to ``outdir/filename``, replacing any existing file.

    The file is encoded as UTF-8 explicitly so non-ASCII text does not
    depend on the platform's default encoding. ``outdir`` must already
    exist; it is not created here.
    """
    Path(outdir, filename).write_text(text, encoding="utf-8")
# Normalise the sample rows, then write three files into OUTDIR that share one
# generated base name: the .ctmedit rows, a .txt file with the first text and
# a .paraphrase file with the second text.
a, b = accept_paraphrase(LINES)
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
OUTDIR = "/Users/joregan/paraphrases"
write_ctm_segment(OUTDIR, a)
filename = generate_filename(a)
write_text(OUTDIR, filename.replace(".ctmedit", ".txt"), b[0])
write_text(OUTDIR, filename.replace(".ctmedit", ".paraphrase"), b[1])
# Bare expression with no effect in a plain script; presumably left so a
# notebook cell displays the text pair.
b
# Second sample; rebinds LINES. Unlike the first sample it contains a "sub"
# row, and its final "cor" row has word columns that differ only by trailing
# punctuation ("förändrad" / "förändrad.").
LINES = """
2442203250006958021 1 39.58 0.06 Vi 1.0 Vi cor
2442203250006958021 1 39.82 0.1 har 1.0 har cor
2442203250006958021 1 40.0 0.34 lämnat 1.0 lämnat cor
2442203250006958021 1 40.44 0.1 in 1.0 in cor
2442203250006958021 1 40.78 0.119 en 1.0 en cor
2442203250006958021 1 42.1 1.479 motivreservation, 1.0 motivreservation, cor
2442203250006958021 1 43.64 0.079 det 1.0 det cor
2442203250006958021 1 43.74 0.119 vill 1.0 vill cor
2442203250006958021 1 43.92 0.179 säga 1.0 säga cor
2442203250006958021 1 44.16 0.1 att 1.0 att cor
2442203250006958021 1 44.3 0.08 den 1.0 <eps> ins
2442203250006958021 1 44.5 0.38 texten 1.0 <eps> ins
2442203250006958021 1 44.94 0.1 som 1.0 <eps> ins
2442203250006958021 1 45.12 0.2 kommer 1.0 <eps> ins
2442203250006958021 1 45.4 0.34 före 1.0 vi sub
2442203250006958021 1 46.38 0.159 vill 1.0 vill cor
2442203250006958021 1 46.56 0.039 vi 1.0 <eps> ins
2442203250006958021 1 46.7 0.039 ha 1.0 ha cor
2442203250006958021 1 46.739 0.0 <eps> 1.0 texten del
2442203250006958021 1 47.4 0.179 lite 1.0 lite cor
2442203250006958021 1 47.64 0.38 förändrad 1.0 förändrad. cor
"""
# Normalise the second sample and compute its segment file name; nothing is
# written to disk by these statements.
a, b = accept_paraphrase(LINES)
# NOTE(review): repeats the call above on the same input, so a and b are
# rebound to equal values — likely a re-run notebook cell; confirm and drop.
a, b = accept_paraphrase(LINES)
# The result is not stored; presumably inspected as notebook cell output.
generate_filename(a)