Original on Kaggle

%%capture
# Install the youtube-dl command-line tool used by the cells below;
# the %%capture cell magic above hides pip's output.
!pip install youtube-dl

# Dump the channel's playlist page as JSON (-j) into rplist.json;
# --flat-playlist lists the entries without resolving each one.
!youtube-dl -j --flat-playlist "https://www.youtube.com/c/heritageboard/playlists?view=1&sort=dd&shelf_id=0" > rplist.json

# For every "url" value found in rplist.json, append that playlist's flat
# JSON listing to pl_videos.json; values for which youtube-dl fails are
# appended to the file "retry". The value is quoted and passed after "--"
# so it is never word-split, glob-expanded or mistaken for an option, and
# "read -r" keeps backslashes intact.
!cat rplist.json | awk -F'"url": "' '{print $2}'|awk -F'"' '{print $1}' | while read -r i;do youtube-dl -j --flat-playlist -- "$i" >> pl_videos.json || echo "$i" >> retry;done

# Dump the channel's videos page as a flat JSON listing into uploads.json.
!youtube-dl -j --flat-playlist "https://www.youtube.com/c/heritageboard/videos?view=0&sort=dd&shelf_id=0" > uploads.json

The following approach does not work:

import json
import requests
# Four independent, initially empty lists of video ids that inner() fills:
# ids whose page contains the licence marker, ids whose page does not,
# ids whose request did not return HTTP 200, and every id already handled.
cc_by, other, retry, seen = [], [], [], []

# Exact text (including the surrounding double quotes) searched for in a
# watch page to decide that the video belongs in cc_by.
lic = '"Creative Commons Attribution licence (reuse allowed)"'
def inner(cur_id):
    """Fetch the watch page for *cur_id* and file the id in one result list.

    The id is appended to exactly one of the module-level lists:

    * ``retry`` -- the request raised a network error or the response
      status was not 200;
    * ``cc_by`` -- the page text contains the ``lic`` marker;
    * ``other`` -- the page was fetched but the marker is absent.

    Ids already present in ``seen`` are skipped, so each id is requested
    at most once per run.

    Args:
        cur_id: Video id to look up.

    Returns:
        None. Results are recorded in ``seen``, ``retry``, ``cc_by`` and
        ``other``.
    """
    if cur_id in seen:
        return
    # Record the id up front so a duplicate is never fetched twice, whatever
    # the outcome of this attempt.
    seen.append(cur_id)

    try:
        # Bound the wait so a single stalled connection cannot hang the run.
        req = requests.get(
            f"https://www.youtube.com/watch?v={cur_id}", timeout=30
        )
    except requests.RequestException:
        retry.append(cur_id)
        return

    if req.status_code != 200:
        # Stop here: a failed fetch must not also be classified below.
        retry.append(cur_id)
        return

    if lic in req.text:
        cc_by.append(cur_id)
    else:
        other.append(cur_id)

# Run inner() on the 'id' of every entry in both listings, playlists first
# and uploads second. Each line of a listing is parsed as one JSON document.
for listing_path in ("pl_videos.json", "uploads.json"):
    with open(listing_path) as listing:
        for raw_line in listing:
            entry = json.loads(raw_line.strip())
            inner(entry['id'])

# Save the three id lists as a single JSON object in proc.json.
results = {'cc-by': cc_by, 'other': other, 'retry': retry}
with open('proc.json', 'w') as outfile:
    outfile.write(json.dumps(results))

Instead, this works:

# For every "id" value in the two listings, have youtube-dl write the
# video's metadata to <id>.info.json (--write-info-json) without fetching
# the video itself (--skip-download). "--" keeps ids that begin with "-"
# from being parsed as options.
!cat pl_videos.json uploads.json|awk -F'"id": "' '{print $2}'|awk -F'"' '{print $1}' | while read i;do youtube-dl --write-info-json --skip-download -o '%(id)s.%(ext)s' -- "$i" ;done

# Download each video listed (one id per line) in cc-by-ids.txt, named by
# its id, and also save its Swedish subtitles (--write-sub --sub-lang sv).
# NOTE(review): cc-by-ids.txt is not created by any step shown here;
# presumably it is derived from the .info.json files -- confirm.
!cat cc-by-ids.txt |while read i;do youtube-dl -o '%(id)s.%(ext)s' --write-sub --sub-lang sv -- "$i" ;done