Load the Sámi Whisper model.
Also includes basic helpers for scraping Sveriges Radio pages.
from transformers import pipeline
import torch
# Hugging Face model id of the Whisper checkpoint to load.
MODEL = "NbAiLab/whisper-large-sme"
# Language code handed to the tokenizer when building the decoder prompt.
# NOTE(review): "fi" presumably stands in because Whisper has no Sámi language
# token — confirm against the model card.
LANG = "fi"

# Use the first CUDA GPU when one is available, otherwise run on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Speech-recognition pipeline; chunk_length_s=30 makes it process long audio
# in 30-second chunks.
pipe = pipeline(task="automatic-speech-recognition", model=MODEL, chunk_length_s=30, device=device)

# Pin the decoder prompt to the configured language and the "transcribe" task.
# This must reference the LANG constant: the lowercase name `lang` is never
# defined and raised a NameError here.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=LANG, task="transcribe")
# Example Sveriges Radio article page; passed to the page-scraping helpers below.
PAGE = "https://sverigesradio.se/artikel/odda-skearru-bitonsami-ludiiguin"
# Example player AJAX endpoint URL for publication id 8580562 (medium quality).
# NOTE(review): presumably this id belongs to PAGE — confirm.
AUDIOJSON = "https://sverigesradio.se/playerajax/audio?id=8580562&type=publication&publicationid=8580562&quality=medium"
import requests
import json
def get_sverigesradio_audio(page, *, timeout=10):
    """Return the audio URL advertised by a Sveriges Radio player JSON endpoint.

    Args:
        page: URL to fetch. Despite the name, this is expected to be the
            ``playerajax/audio`` JSON endpoint rather than an article page.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The value stored under the ``"audioUrl"`` key of the JSON response, or
        ``None`` when the status code is not 200, the body is not a JSON
        object, or the key is absent.

    Raises:
        requests.RequestException: Propagated on connection errors or timeouts.
    """
    # Without an explicit timeout, requests can block forever on a stalled server.
    req = requests.get(page, timeout=timeout)
    if req.status_code != 200:
        return None

    try:
        data = json.loads(req.text)
    except ValueError:
        # A 200 response that is not JSON (e.g. an HTML error page) carries no
        # audio URL; report it the same way as any other miss.
        return None

    # Only a JSON object can hold the key; a list or string would otherwise
    # pass the membership test and then fail on subscripting.
    if not isinstance(data, dict):
        return None
    return data.get("audioUrl")
# IPython shell escape (not valid in a plain Python script): looks up the audio
# URL for AUDIOJSON and hands it to wget on the command line.
# NOTE(review): if the lookup returns None this runs `wget None` — confirm that
# is acceptable or guard the call.
!wget {get_sverigesradio_audio(AUDIOJSON)}
from bs4 import BeautifulSoup
def get_audio_id_from_page(page, *, timeout=10):
    """Extract the ``pageId`` value from a Sveriges Radio article page.

    The page is fetched and every ``<script id="gtm-metadata">`` element is
    inspected; the first one whose content parses as a JSON object with a
    top-level ``"pageId"`` key supplies the result.

    Args:
        page: URL of the article page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The ``"pageId"`` value from the metadata script, or ``None`` when the
        status code is not 200 or no usable metadata script is found.

    Raises:
        requests.RequestException: Propagated on connection errors or timeouts.
    """
    # Without an explicit timeout, requests can block forever on a stalled server.
    req = requests.get(page, timeout=timeout)
    if req.status_code != 200:
        return None

    soup = BeautifulSoup(req.text, 'html.parser')
    # find_all is the current method name; findAll is a deprecated alias.
    for elem in soup.find_all("script", {"id": "gtm-metadata"}):
        raw = elem.text
        # Cheap substring pre-filter before attempting to parse the script body.
        if "pageId" not in raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:
            # Not valid JSON after all; try the next candidate instead of crashing.
            continue
        # The substring may match a nested key or a value, so confirm the key
        # really exists at the top level before indexing.
        if isinstance(data, dict) and "pageId" in data:
            return data["pageId"]
    return None
# Try the id lookup on the example page. This is a bare expression statement,
# so the return value is discarded unless an interactive session echoes it.
get_audio_id_from_page(PAGE)
def get_audio_from_page(page):
    """Resolve the audio URL for a Sveriges Radio article page.

    Looks up the page's id with ``get_audio_id_from_page``, builds the
    matching player AJAX URL (medium quality) and passes it to
    ``get_sverigesradio_audio``.

    Args:
        page: URL of the article page.

    Returns:
        Whatever ``get_sverigesradio_audio`` returns for the built URL, or
        ``None`` when no page id could be found.
    """
    audio_id = get_audio_id_from_page(page)
    if audio_id is not None:
        # The same id fills both the `id` and `publicationid` query parameters.
        ajax_url = f"https://sverigesradio.se/playerajax/audio?id={audio_id}&type=publication&publicationid={audio_id}&quality=medium"
        return get_sverigesradio_audio(ajax_url)
    return None
# Try the full page-to-audio-URL lookup on the example page. This is a bare
# expression statement, so the return value is discarded unless an interactive
# session echoes it.
get_audio_from_page(PAGE)