Filter Riksdag by year
Filtering to only videos from 2017 and 2018
import json
from pathlib import Path
# Directory whose `H*` files are read as JSON by the processing steps in this
# file. Hard-coded, machine-specific absolute path.
_API_DIR = Path("/Users/joregan/riksdag/riksdag-api-out")
def endswith_list(text, items):
    """Report whether ``text`` ends with at least one suffix from ``items``.

    Args:
        text: String to examine.
        items: Iterable of candidate suffix strings.

    Returns:
        True if any suffix matches the end of ``text``; False otherwise,
        including when ``items`` is empty.
    """
    return any(text.endswith(suffix) for suffix in items)
def viddata_get_single_stream(videodata, hires=True):
    """Collect the download URLs of one quality level from a video entry.

    The lookup is best-effort: any missing or ``None`` level on the path
    ``videodata['streams']['files'][i]['bandwidth']`` yields no URLs instead
    of raising.

    Args:
        videodata: Mapping for a single video entry, or ``None``.
        hires: When truthy, select streams named 'Hög kvalitet'; otherwise
            select streams named 'Låg kvalitet'.

    Returns:
        A list of ``downloadurl`` values for the selected quality, in the
        order they appear. Empty when nothing usable is found or when
        ``streams['files']`` is not a list.
    """
    if videodata is None or 'streams' not in videodata:
        return []
    streams = videodata['streams']
    if streams is None or 'files' not in streams:
        return []
    files = streams['files']
    if not isinstance(files, list):
        return []

    wanted = 'Hög kvalitet' if hires else 'Låg kvalitet'
    videos = []
    for vfile in files:
        # A file entry that is None, or has no usable 'bandwidth' list,
        # contributes nothing, like the other best-effort checks above.
        bandwidths = vfile.get('bandwidth') if vfile else None
        for bw in bandwidths or []:
            if bw['name'] == wanted:
                videos.append(bw['downloadurl'])
    return videos
def viddata_get_streams(videodata, hires=True):
    """Collect download URLs from every entry under the 'videodata' key.

    Args:
        videodata: Parsed JSON mapping expected to hold a 'videodata' list.
        hires: Passed through to ``viddata_get_single_stream`` to choose the
            quality level.

    Returns:
        A flat list of the URLs returned by ``viddata_get_single_stream`` for
        each entry, in order. Empty when the 'videodata' key is missing or
        its value is ``None``.
    """
    if 'videodata' not in videodata:
        return []
    entries = videodata['videodata']
    # The key can be present with a null value; treat that as "no videos"
    # rather than failing on iteration.
    if entries is None:
        return []
    output = []
    for vdata in entries:
        output.extend(viddata_get_single_stream(vdata, hires))
    return output
def viddata_from_file(videofile, hires=True):
    """Load a JSON file and return the video download URLs it contains.

    Args:
        videofile: Path of the JSON file to read.
        hires: Passed through to ``viddata_get_streams`` to choose the
            quality level.

    Returns:
        The list produced by ``viddata_get_streams`` for the parsed file.
    """
    # Decode explicitly as UTF-8 so the non-ASCII quality labels compare
    # equal regardless of the platform's default encoding.
    with open(videofile, encoding="utf-8") as jsonf:
        data = json.load(jsonf)
    return viddata_get_streams(data, hires)
def json_matches_years(filename, years):
    """Report whether a JSON file has a debate date ending in a given year.

    The file is parsed and each entry of its 'videodata' list is inspected.
    An entry matches when its 'debatedate' value, stripped of surrounding
    whitespace, ends with one of ``years``.

    Args:
        filename: Path of the JSON file to read.
        years: Iterable of suffix strings (e.g. ``["2017", "2018"]``).

    Returns:
        True as soon as one entry matches. False when the 'videodata' key is
        missing, when its value is ``None``, when a ``None`` entry is reached
        before any match (a message is printed in the last two cases), or
        when no entry matches.
    """
    # Decode explicitly as UTF-8 and release the file as soon as it is parsed.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    if "videodata" not in data:
        return False
    videodata = data["videodata"]
    if videodata is None:
        print(f"Empty videodata: {filename}")
        return False

    # str.endswith accepts a tuple of suffixes, so build it once up front.
    suffixes = tuple(years)
    for vdata in videodata:
        if vdata is None:
            # A null entry ends the scan: the file is reported as not matching.
            print(f"Empty videodata: {filename}")
            return False
        date = vdata.get("debatedate")
        if date is not None and date != "" and date.strip().endswith(suffixes):
            return True
    return False
# Paths (as strings) of every H* file in _API_DIR for which
# json_matches_years reports a 2017 or 2018 match.
matches = [
    str(path)
    for path in _API_DIR.glob("H*")
    if json_matches_years(path, ["2017", "2018"])
]
# Write the matching paths to disk, one per line.
with open("2017-2018.txt", "w") as outf:
    outf.writelines(f"{match}\n" for match in matches)
# One tab-separated line per matching file: the file stem followed by its
# video download URLs.
with open("2017-2018-videos.txt", "w") as outf:
    for path in _API_DIR.glob("H*"):
        if not json_matches_years(path, ["2017", "2018"]):
            continue
        urls = viddata_from_file(path)
        joined = "\t".join(urls)
        outf.write(f"{path.stem}\t{joined}\n")
def get_speaker_data(data):
    """Gather the speaker records from every entry under 'videodata'.

    Args:
        data: Parsed JSON mapping that may hold a 'videodata' list.

    Returns:
        A flat list of the items found in each entry's 'speakers' value, in
        order. Entries that are ``None``, lack 'speakers', or have a ``None``
        'speakers' value are skipped. Empty when 'videodata' is missing or
        ``None``.
    """
    if "videodata" not in data:
        return []
    entries = data["videodata"]
    if entries is None:
        return []

    collected = []
    for entry in entries:
        if entry is None or "speakers" not in entry:
            continue
        talkers = entry["speakers"]
        if talkers is not None:
            collected.extend(talkers)
    return collected
# Load one specific record and keep the text of its first speaker as a sample.
with open("/Users/joregan/riksdag/riksdag-api-out/H501CU20") as inp:
    vdata = json.loads(inp.read())
speakers = get_speaker_data(vdata)
first_speaker = speakers[0]
sample_speech = first_speaker["anftext"]
from bs4 import BeautifulSoup
!pip install mosestokenizer
from mosestokenizer import MosesSentenceSplitter
# Module-level sentence splitter built with the "sv" language code; created
# once here and called by split_text().
splitter = MosesSentenceSplitter("sv")
def split_text(sample_speech, by_paras=False):
    """Split the HTML text of a speech into sentences.

    The markup is parsed with BeautifulSoup; the stripped text of each ``<p>``
    element is passed to the module-level ``splitter``. Paragraphs that are
    empty after stripping, or that start with "STYLEREF Kantrubrik", are
    ignored.

    Args:
        sample_speech: HTML string of the speech. An empty or ``None`` value
            yields an empty result.
        by_paras: When true, keep the sentences grouped per paragraph.

    Returns:
        With ``by_paras`` true, a list with one splitter result per kept
        paragraph; otherwise a single flat list of all sentences in order.
    """
    if not sample_speech:
        # Nothing to parse; avoid handing None to the HTML parser.
        return []

    soup = BeautifulSoup(sample_speech, 'html.parser')
    paras = []
    # find_all is the current name; findAll is a deprecated alias.
    for para in soup.find_all("p"):
        content = para.text.strip()
        if content and not content.startswith("STYLEREF Kantrubrik"):
            paras.append(content)

    # The splitter is called with a list of lines, one paragraph at a time.
    splitparas = [splitter([p]) for p in paras]
    if by_paras:
        return splitparas
    return [sent for sents in splitparas for sent in sents]
# Write every sentence spoken in the 2017/2018 records, one sentence per line.
with open("2017-2018-text.txt", "w") as outf:
    for file in _API_DIR.glob("H*"):
        if not json_matches_years(file, ["2017", "2018"]):
            continue
        with open(file) as inp:
            vdata = json.load(inp)
        speakers = get_speaker_data(vdata)
        for speaker in speakers:
            if "anftext" not in speaker:
                continue
            text = split_text(speaker["anftext"])
            outf.writelines(line + "\n" for line in text)
# Write every sentence from every H* record, regardless of year, one per line.
with open("all-text.txt", "w") as outf:
    for file in _API_DIR.glob("H*"):
        with open(file) as inp:
            vdata = json.load(inp)
        speakers = get_speaker_data(vdata)
        for speaker in speakers:
            if "anftext" not in speaker:
                continue
            text = split_text(speaker["anftext"])
            outf.writelines(line + "\n" for line in text)