Extract data for the ARS project, take 2
Not much use to anyone else, but I may need to repeat it
# List every file under ~/rd_ctm_edit containing "terror" or "teror"
# (case-insensitive).  awk keeps only the filename part of grep's
# "file:match" output; sort | uniq leaves one entry per file.
!grep -i '\(terror\|teror\)' ~/rd_ctm_edit/* | awk -F':' '{print $1}'|sort|uniq > /tmp/terror_files
# Shuffle that list and keep five entries; the IPython "!" assignment
# captures the command's output lines into FILES.
FILES=!cat /tmp/terror_files | shuf | head -n 5
# Bounds on the extracted window, compared against time_difference() values
# (presumably seconds, i.e. 7 and 6 minutes -- confirm against the CTM files).
UPPER_BOUND = 60 * 7
# NOTE(review): in the code visible here LOWER_BOUND is only used by the
# scratch randrange call below.
LOWER_BOUND = 60 * 6
import random
# Scratch expression: draws an integer in [LOWER_BOUND, UPPER_BOUND); the
# result is not assigned to anything.
random.randrange(LOWER_BOUND, UPPER_BOUND)
def slurp(filename):
    """Read *filename* and return its non-blank lines, whitespace-stripped.

    Lines that are empty after stripping are dropped; the order of the
    remaining lines is preserved.
    """
    with open(filename) as handle:
        stripped = (raw.strip() for raw in handle)
        return [text for text in stripped if text != ""]
from pathlib import Path
# Directory the input files are read from.
BASEPATH = Path("/home/joregan/rd_ctm_edit")
# One file loaded up front so the functions below can be tried interactively.
testing = slurp(str(BASEPATH.joinpath("H9C120210930fs")))
def get_terror_lines(lines):
    """Return the lines that contain "terror" or "teror", ignoring case.

    The misspelling "teror" is matched separately because it is not a
    substring of "terror".  Input order is preserved.
    """
    needles = ("terror", "teror")
    return [
        entry
        for entry in lines
        if any(needle in entry.lower() for needle in needles)
    ]
def file_upper_bound(lines):
    """Return the sum of the third and fourth fields of the last line.

    Fields are separated by single spaces.  Presumably these are the CTM
    start time and duration, making the result the end time of the file --
    confirm against the data.
    """
    fields = lines[-1].split(" ")
    begin = float(fields[2])
    length = float(fields[3])
    return begin + length
def file_upper_bound_start(lines):
    """Return the third space-separated field of the last line as a float.

    Presumably the start time of the final entry -- confirm against the data.
    """
    return float(lines[-1].split(" ")[2])
def get_random_mention(lines):
    """Return one randomly chosen line that mentions "terror"/"teror".

    Every matching line, including the last one, is equally likely to be
    returned.

    Raises:
        ValueError: if none of *lines* contains a mention.
    """
    terror_lines = get_terror_lines(lines)
    if not terror_lines:
        # Raise ValueError rather than random.choice's IndexError, so the
        # exception type for "nothing to pick from" stays what it was.
        raise ValueError("no lines mention terror")
    # random.choice covers the whole list; randrange(0, len - 1) excludes its
    # stop value and therefore could never select the final mention.
    return random.choice(terror_lines)
def random_time(lines):
    """Pick a random mention and return ``(start, line)``.

    ``start`` is the third space-separated field of the chosen line as a
    float (presumably its start time); ``line`` is the chosen line itself.
    """
    mention = get_random_mention(lines)
    return float(mention.split(" ")[2]), mention
def time_difference(line1, line2):
    """Return the time from the start of *line1* to the end of *line2*.

    The start is field 3 of *line1*; the end is field 3 plus field 4 of
    *line2* (fields are separated by single spaces).
    """
    head = line1.split(" ")
    tail = line2.split(" ")
    begin = float(head[2])
    tail_start, tail_length = float(tail[2]), float(tail[3])
    return (tail_start + tail_length) - begin
def do_the_thing(lines):
    """Pick a random mention and return a window of lines around it.

    A mention is drawn with random_time() until its start value is below the
    start value of the final line.  Beginning at that mention, the window is
    widened by one line in each direction per step for as long as
    time_difference() between its two edge lines stays under UPPER_BOUND.
    When the difference exceeds UPPER_BOUND both edges step back by one.

    Returns:
        The slice ``lines[low:high]``; the line at index ``high`` itself is
        not included.
    """
    final_start = file_upper_bound_start(lines)
    chosen_start = final_start
    chosen_line = ""
    # Redraw until the chosen mention starts before the final line does.
    # NOTE(review): this never terminates if every mention starts at or
    # after the final line's start value.
    while chosen_start >= final_start:
        chosen_start, chosen_line = random_time(lines)

    # list.index returns the first line equal to the chosen one.
    low = high = lines.index(chosen_line)
    # NOTE(review): a mention at index 0 skips the loop entirely, so the
    # returned slice is empty.
    while high < len(lines) and low > 0:
        span = time_difference(lines[low], lines[high])
        if span < UPPER_BOUND:
            # Still under the limit: widen by one line on each side.
            high += 1
            low -= 1
            continue
        if span > UPPER_BOUND:
            # Overshot: undo the last widening step.
            high -= 1
            low += 1
        # Reached on overshoot and on an exact match with UPPER_BOUND.
        break
    return lines[low:high]
# Try the extraction on the sample file loaded into `testing`.
a = do_the_thing(testing)
# Time from the start of the first selected line to the end of the last one.
# The value is not stored; it is only visible when this is the final
# expression of an interactive cell.
time_difference(a[0], a[-1])
# One extracted window per input file, in the same order as FILES.
segments = []
with open("/tmp/run_ffmpeg1.sh", "w") as runsh, open("/tmp/segments.ctm", "w") as segctm:
    for filename in FILES:
        lines = slurp(filename)
        seg = do_the_thing(lines)
        segments.append(seg)
        # The first field of the first line names the source video.
        vidid = lines[0].split(" ")[0]
        command = f"ffmpeg -i /sbtal/riksdag-video/{vidid}_480p.mp4 -acodec pcm_s16le -ac 1 -ar 16000 /tmp/{vidid}.wav\n"
        runsh.write(command)
        # Write the window's lines, then a blank line to separate segments.
        segctm.writelines(segline + "\n" for segline in seg)
        segctm.write("\n")
# Run the ffmpeg commands collected in /tmp/run_ffmpeg1.sh, producing the
# /tmp/<vidid>.wav files.
!bash /tmp/run_ffmpeg1.sh
from pydub import AudioSegment
# Extra arguments handed to pydub's export(): mono, 16-bit PCM, 16 kHz.
parameters = ["-ac", "1", "-acodec", "pcm_s16le", "-ar", "16000"]
# Cut each extracted window out of its source audio and save it as
# /tmp/SEG_<vidid>.wav.
for seg in segments:
    first = seg[0].split(" ")
    last = seg[-1].split(" ")
    vidid = first[0]
    # pydub slices by milliseconds, so both bounds are scaled by 1000.
    start = int(float(first[2]) * 1000)
    # End of the window = (field 3 + field 4) of the last line.  The sum must
    # be parenthesised before scaling; otherwise only the fourth field is
    # multiplied and the end bound is in mixed units.
    end = int((float(last[2]) + float(last[3])) * 1000)
    wavaudio = AudioSegment.from_wav(f"/tmp/{vidid}.wav")
    sect = wavaudio[start:end]
    sect.export(f"/tmp/SEG_{vidid}.wav", format="wav", parameters=parameters)