Silero VAD on audio with clean silences
It does not work very well.
%%capture
# Download the zipped MP3 recording from wolnelektury.pl (cell output is captured).
!wget https://wolnelektury.pl/katalog/zip/mp3/z-wichrow-i-hal-z-tatr-krzak-dzikiej-rozy-w-ciemnych-smreczy.zip
%%capture
# Unpack the archive into the current working directory.
!unzip z-wichrow-i-hal-z-tatr-krzak-dzikiej-rozy-w-ciemnych-smreczy.zip
%%capture
# Convert the first MP3 (_000) to test.wav: signed 16-bit little-endian PCM
# (-acodec pcm_s16le), one channel (-ac 1), 16000 Hz (-ar 16000). The later
# cells hard-code these same parameters when decoding raw bytes.
!ffmpeg -i z-wichrow-i-hal-z-tatr-krzak-dzikiej-rozy-w-ciemnych-smreczy_000.mp3 -acodec pcm_s16le -ac 1 -ar 16000 test.wav
import torch
# Load the Silero VAD model and its helper functions through torch.hub.
# force_reload=True asks torch.hub for a fresh download of the repository
# instead of reusing a cached copy, so this line needs network access.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=True)
%%capture
# Install pydub, used below for slicing audio and for silence detection.
%pip install pydub
# Slice bounds: 5.32 and 9.1 are multiplied by 1000 and truncated to int,
# i.e. seconds converted to the milliseconds pydub uses for slicing.
start = int(5.32 * 1000)
end = int(9.1 * 1000)
from pydub import AudioSegment
# Load the converted WAV and cut out the [start, end) slice.
audio = AudioSegment.from_wav("test.wav")
seg = audio[start:end]
import IPython.display as ipd
# Size in bytes of the slice's raw sample data.
len(seg.raw_data)
# The raw-byte decoding further down assumes PCM_16, i.e. 2 bytes per sample.
assert seg.sample_width == 2
# Write the slice to seg.wav so it can be read back from disk.
seg.export("seg.wav", format="wav")
# Unpack the helper functions returned alongside the model by torch.hub.load.
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
# Read the exported slice with Silero's own reader.
# NOTE(review): `wav` is not used again in the visible part of the notebook.
wav = read_audio("seg.wav")
# Inspect the exported file's container/stream information with ffprobe.
!ffprobe -i seg.wav
# Number of channels in the slice.
seg.channels
import io
import soundfile as sf
# Decode the slice's raw bytes directly from memory: headerless (RAW) data
# interpreted as 16-bit PCM, 16000 Hz, one channel.
data, samplerate = sf.read(io.BytesIO(seg.raw_data), format="RAW", subtype="PCM_16", samplerate=16000, channels=1)
# Write the decoded samples back out as seg2.wav at 16000 Hz and play the file.
sf.write("seg2.wav", data, 16000)
ipd.Audio("seg2.wav")
# Run Silero VAD over the whole converted file rather than the short slice.
try2 = read_audio('test.wav', sampling_rate=16000)
speech_timestamps = get_speech_timestamps(try2, model, sampling_rate=16_000)
# Display the detected speech timestamps.
speech_timestamps
def get_raw_data(raw_data):
    """Decode headerless 16-bit PCM bytes with soundfile.

    Args:
        raw_data: Bytes of raw audio, interpreted as RAW / PCM_16,
            one channel, 16000 Hz.

    Returns:
        Whatever ``sf.read`` returns for that stream, unchanged (the notebook
        unpacks it as ``data, samplerate``).
    """
    pcm_stream = io.BytesIO(raw_data)
    read_options = {
        "format": "RAW",
        "subtype": "PCM_16",
        "samplerate": 16000,
        "channels": 1,
    }
    return sf.read(pcm_stream, **read_options)
def get_start_end(audio, start, end):
    """Return the decoded samples of ``audio`` between ``start`` and ``end``.

    Args:
        audio: Sliceable audio object (a pydub ``AudioSegment`` in this
            notebook). It is sliced as ``audio[start:end]`` and the slice
            must expose ``raw_data``, ``frame_rate`` and ``channels``.
        start: Slice start. A ``float`` is taken as seconds and converted to
            whole milliseconds; any other value is used unchanged as the
            slice bound.
        end: Slice end, converted by the same rule as ``start``.

    Returns:
        The sample data returned by ``sf.read`` for the slice's raw bytes
        (the sample rate that ``sf.read`` also returns is discarded).
    """

    def _to_slice_bound(position):
        # isinstance rather than an exact type comparison, so float
        # subclasses (e.g. numpy.float64) are also treated as seconds.
        if isinstance(position, float):
            return int(position * 1000)
        return position

    seg = audio[_to_slice_bound(start):_to_slice_bound(end)]
    # The bytes are still decoded as 16-bit PCM; the sample rate and channel
    # count are taken from the slice itself instead of being hard-coded, which
    # gives the same result for 16000 Hz mono input.
    raw, _ = sf.read(
        io.BytesIO(seg.raw_data),
        format="RAW",
        subtype="PCM_16",
        samplerate=seg.frame_rate,
        channels=seg.channels,
    )
    return raw
# Play one slice of the recording. 606 and 1657 are ints, so get_start_end
# uses them unchanged as slice bounds (milliseconds for pydub).
ipd.Audio(get_start_end(audio, 606, 1657), rate=16000)
# Alternative to VAD: ffmpeg's silencedetect filter with d=0.1; the audio
# output is discarded (-f null -), only the filter's log output matters.
!ffmpeg -i test.wav -af silencedetect=d=0.1 -f null -
from pydub.silence import detect_nonsilent
# Second alternative: pydub's non-silence detection.
# NOTE(review): the positional arguments look like min_silence_len=100 (ms)
# and silence_thresh=-60 (dBFS) — confirm against the pydub signature.
detect_nonsilent(audio, 100, -60)