Instaloader to N-Triples
Grab file locations
import lzma
from pathlib import Path
BASE = Path("/Users/joregan/Playing/instascr")
import json
from datetime import datetime
# Date stamp (YYYYMMDD) for today's run.
datestr = datetime.today().strftime('%Y%m%d')

# Load one sample post record to inspect its structure.
testf = '/Users/joregan/Playing/instascr/mollyryanxo/2020-06-06_20-36-58_UTC.json.xz'
# Use a context manager so the compressed file handle is closed promptly.
with lzma.open(testf) as xzf:
    jsons = xzf.read().decode('utf-8')
data = json.loads(jsons)

# Show which keys the record's 'node' object carries; this has to run after
# `data` is loaded, otherwise it fails with a NameError.
print(data['node'].keys())
username = data['node']['owner']['username']

# Exploratory snippet kept for reference: list the video URL of each child
# of a multi-item ("sidecar") post.
#for edge in data['node']['edge_sidecar_to_children']['edges']:
#    if 'video_url' in edge['node']:
#        print(edge['node']['video_url'])
def get_from_data(data):
    """Collect the media URLs referenced by one parsed post record.

    Args:
        data: Parsed JSON dict. It is expected to have a top-level 'node'
            key whose value holds 'display_url', optionally 'video_url',
            and optionally 'edge_sidecar_to_children' -> 'edges', a list of
            children that each wrap their own 'node'.

    Returns:
        A sorted list of unique URLs: the post's own display/video URLs
        plus the display/video URLs of every sidecar child. An empty list
        is returned (after printing an error) when 'node' is missing.

    Raises:
        KeyError: if a 'node' (top-level or child) lacks 'display_url', or
            the sidecar object lacks 'edges'.
    """
    if 'node' not in data:
        print("Error reading file")
        # Nothing to extract; bail out instead of failing on data['node'].
        return []

    node = data['node']
    urls = set()

    if 'edge_sidecar_to_children' in node:
        for edge in node['edge_sidecar_to_children']['edges']:
            child = edge['node']
            urls.add(child['display_url'])
            # The video belongs to the child item, not to the parent post.
            if 'video_url' in child:
                urls.add(child['video_url'])

    urls.add(node['display_url'])
    if 'video_url' in node:
        urls.add(node['video_url'])

    # Sort so the output order is stable across runs (set order is not).
    return sorted(urls)
# Quick check of the extractor on the record currently held in `data`.
get_from_data(data)

# Fixed date stamp used in the output file name; this replaces any value
# assigned to `datestr` earlier in the file.
datestr = "20220417"
from pathlib import Path
BASE = Path("/Users/joregan/Playing/instascr")

# Walk every .xz file under BASE and write one N-Triples statement per media
# URL, linking the URL to the Instagram post page it came from.
# N-Triples documents are UTF-8, so the encoding is set explicitly.
with open(f"/Users/joregan/Playing/400bcacf78036990182af6bbd7e41a71/instascrape-{datestr}.nt", "w", encoding="utf-8") as outf:
    for xzfile in BASE.glob("**/*.xz"):
        # Close each compressed file as soon as it has been read, so handles
        # do not pile up while iterating over the whole tree.
        with lzma.open(xzfile) as xzf:
            jsons = xzf.read().decode('utf-8')
        data = json.loads(jsons)

        # Records without a 'node' or an owner cannot be attributed to a
        # post page; skip them rather than failing on the lookup.
        if 'node' not in data or 'owner' not in data['node']:
            print(f"Skipping {str(xzfile)}")
            continue
        if 'shortcode' not in data['node']:
            print(f"Missing shortcode: {str(xzfile)}")
            continue

        username = data['node']['owner']['username']
        short = data['node']['shortcode']
        urls = get_from_data(data)
        for url in urls:
            outf.write(f"<{url}> <http://xmlns.com/foaf/0.1/page> <https://www.instagram.com/p/{short}/?taken-by={username}>\n")