import lzma
from pathlib import Path
import json
from datetime import datetime

# Root directory that holds the scraped per-account folders of .json.xz files.
BASE = Path("/Users/joregan/Playing/instascr")
# Today's date as YYYYMMDD, used to stamp output file names.
datestr = datetime.today().strftime('%Y%m%d')

# Load one sample post so its JSON layout can be inspected.
testf = '/Users/joregan/Playing/instascr/mollyryanxo/2020-06-06_20-36-58_UTC.json.xz'
# Use a context manager so the compressed file handle is closed promptly.
with lzma.open(testf) as fh:
    jsons = fh.read().decode('utf-8')
data = json.loads(jsons)
# The keys can only be printed once `data` has been loaded.
print(data['node'].keys())
# Output recorded from an earlier run (kept for reference only):
# dict_keys(['__typename', 'id', 'gating_info', 'fact_check_overall_rating',
#            'fact_check_information', 'media_overlay_info',
#            'sensitivity_friction_info', 'sharing_friction_info', 'dimensions',
#            'display_url', 'display_resources', 'is_video', 'media_preview',
#            'tracking_token', 'edge_media_to_tagged_user',
#            'accessibility_caption', 'edge_media_to_caption', 'shortcode',
#            'edge_media_to_comment', 'edge_media_to_sponsor_user',
#            'comments_disabled', 'taken_at_timestamp',
#            'edge_media_preview_like', 'owner', 'location', 'viewer_has_liked',
#            'viewer_has_saved', 'viewer_has_saved_to_collection',
#            'viewer_in_photo_of_you', 'viewer_can_reshare', 'thumbnail_src',
#            'thumbnail_resources', 'edge_sidecar_to_children'])
username = data['node']['owner']['username']
#for edge in data['node']['edge_sidecar_to_children']['edges']:
#    if 'video_url' in edge['node']:
#        print(edge['node']['video_url'])
def get_from_data(data):
    """Collect the media URLs referenced by one post's JSON metadata.

    Gathers the ``display_url`` (and ``video_url`` when present) of every
    child under ``edge_sidecar_to_children``, followed by the post's own
    ``display_url`` and optional ``video_url``.

    Args:
        data: Parsed JSON mapping; the post is expected under ``data['node']``.

    Returns:
        A list of unique URLs in first-seen order. The list is empty when
        ``data`` has no ``'node'`` entry.

    Raises:
        KeyError: If the post or one of its children lacks ``display_url``.
    """
    if 'node' not in data:
        print("Error reading file")
        # Nothing can be extracted without the post node.
        return []
    node = data['node']
    found = []
    if 'edge_sidecar_to_children' in node:
        for edge in node['edge_sidecar_to_children']['edges']:
            child = edge['node']
            found.append(child['display_url'])
            # The video URL belongs to the child itself, not the parent post.
            if 'video_url' in child:
                found.append(child['video_url'])
    found.append(node['display_url'])
    if 'video_url' in node:
        found.append(node['video_url'])
    # dict.fromkeys drops duplicates while keeping a deterministic order.
    return list(dict.fromkeys(found))
# Notebook leftover: exercises get_from_data on the sample post; the result
# is discarded.
get_from_data(data)
# Pinned date stamp for this export; it overrides any earlier `datestr` value.
datestr = "20220417"
from pathlib import Path
BASE = Path("/Users/joregan/Playing/instascr")
# Write one N-Triples line per media URL, linking it to the page of its post.
with open(f"/Users/joregan/Playing/400bcacf78036990182af6bbd7e41a71/instascrape-{datestr}.nt", "w", encoding="utf-8") as outf:
    for xzfile in BASE.glob("**/*.xz"):
        # Close each compressed file as soon as it is parsed, so handles do
        # not accumulate over a large directory tree.
        with lzma.open(xzfile, "rt", encoding="utf-8") as fh:
            data = json.load(fh)
        # Skip files without a post node instead of aborting the whole export.
        if 'node' not in data:
            print(f"Missing node: {xzfile}")
            continue
        node = data['node']
        if 'owner' not in node:
            print(f"Skipping {xzfile}")
            continue
        if 'shortcode' not in node:
            print(f"Missing shortcode: {xzfile}")
            continue
        username = node['owner']['username']
        short = node['shortcode']
        urls = get_from_data(data)
        for url in urls:
            outf.write(f"<{url}> <http://xmlns.com/foaf/0.1/page> <https://www.instagram.com/p/{short}/?taken-by={username}>\n")