import json
from pathlib import Path
_API_DIR = Path("/Users/joregan/riksdag/riksdag-api-out")

def endswith_list(text, items):
    """Return True if ``text`` ends with at least one entry of ``items``.

    An empty ``items`` yields False.
    """
    return any(text.endswith(suffix) for suffix in items)

def viddata_get_single_stream(videodata, hires=True):
    """Collect download URLs of one quality level from a single video entry.

    The entry is expected to look like
    ``{'streams': {'files': [{'bandwidth': [{'name': ..., 'downloadurl': ...}]}]}}``.
    Any part of that structure that is missing, null, or of an unexpected
    type is skipped instead of raising, so a malformed entry simply
    contributes no URLs.

    Args:
        videodata: One element of the top-level ``'videodata'`` list, or None.
        hires: If True select streams named 'Hög kvalitet', otherwise
            those named 'Låg kvalitet'.

    Returns:
        A list of the matching ``'downloadurl'`` values, in file order.
    """
    wanted = 'Hög kvalitet' if hires else 'Låg kvalitet'

    if not isinstance(videodata, dict):
        return []
    streams = videodata.get('streams')
    if not isinstance(streams, dict):
        return []
    files = streams.get('files')
    if not isinstance(files, list):
        return []

    videos = []
    for vfile in files:
        if not isinstance(vfile, dict):
            continue
        # 'bandwidth' may be absent or null; treat both as "no streams".
        for bw in vfile.get('bandwidth') or []:
            if not isinstance(bw, dict):
                continue
            if bw.get('name') == wanted and 'downloadurl' in bw:
                videos.append(bw['downloadurl'])
    return videos

def viddata_get_streams(videodata, hires=True):
    """Collect download URLs from every entry under the ``'videodata'`` key.

    Args:
        videodata: Parsed JSON document; its ``'videodata'`` value is
            iterated and each element is passed to
            ``viddata_get_single_stream``.
        hires: Forwarded to ``viddata_get_single_stream``.

    Returns:
        The concatenated URL lists of all entries; an empty list when the
        ``'videodata'`` key is missing or its value is null.
    """
    if 'videodata' not in videodata:
        return []
    entries = videodata['videodata']
    # A null 'videodata' value cannot be iterated; treat it as "no videos".
    if entries is None:
        return []
    output = []
    for vdata in entries:
        output.extend(viddata_get_single_stream(vdata, hires))
    return output

def viddata_from_file(videofile, hires=True):
    """Load a JSON file and return the download URLs it contains.

    Args:
        videofile: Path of the JSON file to read.
        hires: Forwarded to ``viddata_get_streams``.

    Returns:
        The list produced by ``viddata_get_streams`` for the parsed file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(videofile, encoding="utf-8") as jsonf:
        data = json.load(jsonf)
    # The file is closed before the parsed data is processed.
    return viddata_get_streams(data, hires)

def json_matches_years(filename, years):
    """Tell whether a JSON file has a debate date ending in one of ``years``.

    The file's ``'videodata'`` list is scanned in order; the first entry
    whose stripped ``'debatedate'`` string ends with any of ``years`` makes
    the function return True.

    Args:
        filename: Path of the JSON file to inspect.
        years: Iterable of suffix strings, e.g. ``["2017", "2018"]``.

    Returns:
        True on the first matching entry. False when the ``'videodata'``
        key is missing, when no entry matches, or (after printing a notice)
        when ``'videodata'`` is null or a null entry is reached before any
        match.
    """
    # Decode explicitly as UTF-8 and release the file before scanning.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    if "videodata" not in data:
        return False
    videodata = data["videodata"]
    if videodata is None:
        print(f"Empty videodata: {filename}")
        return False

    # str.endswith accepts a tuple of candidate suffixes.
    suffixes = tuple(years)
    for vdata in videodata:
        if vdata is None:
            print(f"Empty videodata: {filename}")
            return False
        date = vdata["debatedate"] if "debatedate" in vdata else None
        # Skip missing, null, empty, or non-string dates.
        if isinstance(date, str) and date and date.strip().endswith(suffixes):
            return True
    return False

# Paths (as strings) of every "H*" file under _API_DIR for which
# json_matches_years reports a 2017 or 2018 debate date.
matches = [
    str(file)
    for file in _API_DIR.glob("H*")
    if json_matches_years(file, ["2017", "2018"])
]

# Output: Empty videodata: /Users/joregan/riksdag/riksdag-api-out/H8C120210621zz

# Write the collected paths to disk, one per line.
with open("2017-2018.txt", "w") as outf:
    outf.writelines(m + "\n" for m in matches)

# One tab-separated line per matching file: the file stem followed by the
# URLs returned by viddata_from_file (high quality, its default).
with open("2017-2018-videos.txt", "w") as outf:
    for file in _API_DIR.glob("H*"):
        if not json_matches_years(file, ["2017", "2018"]):
            continue
        videos = viddata_from_file(file)
        outf.write(file.stem + "\t" + "\t".join(videos) + "\n")

# Output: Empty videodata: /Users/joregan/riksdag/riksdag-api-out/H8C120210621zz

def get_speaker_data(data):
    """Gather every speaker record found under ``data["videodata"]``.

    Entries that are None, lack a ``"speakers"`` key, or have a null
    ``"speakers"`` value are skipped. Returns an empty list when the
    ``"videodata"`` key is missing or null.
    """
    if "videodata" not in data:
        return []
    entries = data["videodata"]
    if entries is None:
        return []

    collected = []
    for entry in entries:
        if entry is None or "speakers" not in entry:
            continue
        speakers = entry["speakers"]
        if speakers is None:
            continue
        collected.extend(speakers)
    return collected

# Load one file and keep the "anftext" of its first speaker as sample
# input for experimenting with the text-splitting code.
# Decode explicitly as UTF-8 so the result does not depend on the locale.
with open("/Users/joregan/riksdag/riksdag-api-out/H501CU20", encoding="utf-8") as inp:
    vdata = json.load(inp)
# The file is closed before the parsed data is processed.
speakers = get_speaker_data(vdata)
sample_speech = speakers[0]["anftext"]

from bs4 import BeautifulSoup

!pip install mosestokenizer

from mosestokenizer import MosesSentenceSplitter
# Module-level sentence splitter, constructed once with the language
# argument "sv" (presumably Swedish) and reused by split_text.
splitter = MosesSentenceSplitter("sv")

# Output: stdbuf was not found; communication with perl may hang due to stdio buffering.

def split_text(sample_speech, by_paras=False):
    """Split the ``<p>`` paragraphs of an HTML speech into sentences.

    The markup is parsed with BeautifulSoup's ``html.parser``. Paragraphs
    whose stripped text is empty or starts with ``"STYLEREF Kantrubrik"``
    are dropped; each remaining paragraph is passed to the module-level
    ``splitter``.

    Args:
        sample_speech: HTML markup of one speech.
        by_paras: If True, keep the sentences grouped per paragraph.

    Returns:
        A list with one ``splitter`` result per paragraph when
        ``by_paras`` is True, otherwise a flat list of all sentences in
        document order.
    """
    soup = BeautifulSoup(sample_speech, 'html.parser')

    paras = []
    # find_all is the current name of the legacy findAll alias.
    for para in soup.find_all("p"):
        content = para.text.strip()
        if content and not content.startswith("STYLEREF Kantrubrik"):
            paras.append(content)

    splitparas = [splitter([p]) for p in paras]
    if by_paras:
        return splitparas
    return [sent for sents in splitparas for sent in sents]

# Write every sentence spoken in the 2017/2018 files, one sentence per line.
# Both the JSON input and the text output use explicit UTF-8 so the result
# does not depend on the locale.
with open("2017-2018-text.txt", "w", encoding="utf-8") as outf:
    for file in _API_DIR.glob("H*"):
        if not json_matches_years(file, ["2017", "2018"]):
            continue
        with open(file, encoding="utf-8") as inp:
            vdata = json.load(inp)
        speakers = get_speaker_data(vdata)
        for speaker in speakers:
            # Skip speakers with missing, null, or empty text: a null value
            # cannot be parsed as markup, and empty text has no sentences.
            if "anftext" not in speaker or not speaker["anftext"]:
                continue
            text = split_text(speaker["anftext"])
            outf.writelines(line + "\n" for line in text)

# Output: Empty videodata: /Users/joregan/riksdag/riksdag-api-out/H8C120210621zz

# Write every sentence from every "H*" file, one sentence per line.
# Both the JSON input and the text output use explicit UTF-8 so the result
# does not depend on the locale.
with open("all-text.txt", "w", encoding="utf-8") as outf:
    for file in _API_DIR.glob("H*"):
        with open(file, encoding="utf-8") as inp:
            vdata = json.load(inp)
        speakers = get_speaker_data(vdata)
        for speaker in speakers:
            # Skip speakers with missing, null, or empty text: a null value
            # cannot be parsed as markup, and empty text has no sentences.
            if "anftext" not in speaker or not speaker["anftext"]:
                continue
            text = split_text(speaker["anftext"])
            outf.writelines(line + "\n" for line in text)