from transformers import pipeline
import torch
# Hugging Face model id of the Whisper checkpoint loaded by the pipeline.
MODEL = "NbAiLab/whisper-large-sme"
# Language code used to build the forced decoder prompt below.
# NOTE(review): the model id says "sme" while the prompt language is "fi" —
# presumably intentional; confirm against the model card.
LANG = "fi"
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
# Speech-recognition pipeline; audio is processed in 30-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL,
    chunk_length_s=30,
    device=device,
)
# Force the decoder prompt to the configured language and the "transcribe"
# task. The constant is LANG; the previous lowercase `lang` was undefined and
# raised NameError.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=LANG, task="transcribe"
)
# Example article page on sverigesradio.se used by the calls further down.
PAGE = (
    "https://sverigesradio.se"
    "/artikel/odda-skearru-bitonsami-ludiiguin"
)
# Example player endpoint URL (publication id 8580562, medium quality).
AUDIOJSON = (
    "https://sverigesradio.se/playerajax/audio"
    "?id=8580562&type=publication&publicationid=8580562&quality=medium"
)
import requests
import json
def get_sverigesradio_audio(page, timeout=30):
    """Return the ``audioUrl`` value served by a JSON endpoint, or ``None``.

    Args:
        page: URL to fetch; its body is expected to be a JSON object.
        timeout: Seconds to wait for the server. ``requests.get`` has no
            default timeout, so without one a stalled server blocks forever.

    Returns:
        The value stored under ``"audioUrl"``, or ``None`` when the status
        code is not 200, the body is not valid JSON, the JSON is not an
        object, or the key is absent.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    req = requests.get(page, timeout=timeout)
    if req.status_code != 200:
        return None
    try:
        data = json.loads(req.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError; treat an unparsable body the
        # same as any other "no audio URL available" outcome.
        return None
    if not isinstance(data, dict):
        return None
    return data.get("audioUrl")
# IPython/Jupyter shell escape (not plain Python): runs wget on the URL
# returned by get_sverigesradio_audio(AUDIOJSON).
# NOTE(review): if the lookup returns None, the text "None" is presumably
# what gets passed to wget — confirm and guard if needed.
!wget {get_sverigesradio_audio(AUDIOJSON)}
from bs4 import BeautifulSoup
def get_audio_id_from_page(page, timeout=30):
    """Extract the ``pageId`` value from a page's ``gtm-metadata`` script.

    Args:
        page: URL of the HTML page to fetch.
        timeout: Seconds to wait for the server. ``requests.get`` has no
            default timeout, so without one a stalled server blocks forever.

    Returns:
        The ``"pageId"`` value from the first ``<script id="gtm-metadata">``
        element whose content parses as a JSON object containing that key,
        or ``None`` when the status code is not 200 or no such element
        exists.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    req = requests.get(page, timeout=timeout)
    if req.status_code != 200:
        return None
    soup = BeautifulSoup(req.text, 'html.parser')
    # find_all is the current name of the deprecated findAll alias.
    for elem in soup.find_all("script", {"id": "gtm-metadata"}):
        # Cheap substring test first so unrelated scripts are never parsed.
        if "pageId" not in elem.text:
            continue
        try:
            data = json.loads(elem.text)
        except ValueError:
            # Malformed JSON in this element: try the next candidate.
            continue
        # The substring may match a nested key or a value; require a real
        # top-level key before indexing.
        if isinstance(data, dict) and "pageId" in data:
            return data["pageId"]
    return None
# Look up the audio id for the example article page; the result is not stored.
get_audio_id_from_page(PAGE)
# NOTE(review): the bare string below looks like the notebook output of the
# call above, left in as a no-op expression statement — confirm.
'8580562'
def get_audio_from_page(page):
    """Resolve an article page URL to its audio URL.

    The page is first passed to ``get_audio_id_from_page``; the id it returns
    is inserted into the player endpoint URL, which is then handed to
    ``get_sverigesradio_audio``.

    Args:
        page: URL of the article page.

    Returns:
        Whatever ``get_sverigesradio_audio`` returns for the built endpoint
        URL, or ``None`` when no id could be found for the page.
    """
    audio_id = get_audio_id_from_page(page)
    if audio_id is not None:
        # The same id fills both the `id` and `publicationid` parameters.
        endpoint = (
            f"https://sverigesradio.se/playerajax/audio?id={audio_id}"
            f"&type=publication&publicationid={audio_id}&quality=medium"
        )
        return get_sverigesradio_audio(endpoint)
    return None
# Resolve the example article page to its audio URL; the result is not stored.
get_audio_from_page(PAGE)