%%capture
# Install auditok (used below to split the audio) with %pip so the package
# lands in the running kernel's environment; `!pip` may resolve to another Python.
%pip install auditok
%%capture
# System PyAudio package. `apt-get install -y` is the non-interactive form, so
# no `yes |` pipe is needed (and `apt` warns that its CLI is not script-stable).
!apt-get install -y python3-pyaudio
%%capture
# youtube-dl supplies the command-line downloader invoked right after this.
%pip install youtube-dl
# The URL is quoted so the shell cannot treat `?` as a glob character.
!youtube-dl 'https://www.youtube.com/watch?v=D44-x6PTd_Q'
[youtube] D44-x6PTd_Q: Downloading webpage
[youtube] D44-x6PTd_Q: Downloading MPD manifest
WARNING: Requested formats are incompatible for merge and will be merged into mkv.
[dashsegments] Total fragments: 22
[download] Destination: Sraith Pictiúr - An Ghaeilge Seoid luachmhar-D44-x6PTd_Q.f247.webm
[download]  22.7% of ~12.60MiB at  5.49MiB/s ETA 00:13[download] Got server HTTP error: HTTP Error 404: Not Found. Retrying fragment 6 (attempt 1 of 10)...
[download] 100% of 15.12MiB in 00:18
[download] Destination: Sraith Pictiúr - An Ghaeilge Seoid luachmhar-D44-x6PTd_Q.f140.m4a
[download] 100% of 1.73MiB in 00:00
[ffmpeg] Merging formats into "Sraith Pictiúr - An Ghaeilge Seoid luachmhar-D44-x6PTd_Q.mkv"
Deleting original file Sraith Pictiúr - An Ghaeilge Seoid luachmhar-D44-x6PTd_Q.f247.webm (pass -k to keep)
Deleting original file Sraith Pictiúr - An Ghaeilge Seoid luachmhar-D44-x6PTd_Q.f140.m4a (pass -k to keep)
import auditok

# File produced by the youtube-dl download above.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept because a later cell reads it.
input = 'Sraith Pictiúr - An Ghaeilge Seoid luachmhar-D44-x6PTd_Q.mkv'

# Split the recording into audio regions. Durations and the silence tolerance
# are presumably in seconds (consistent with the printed region lengths, which
# never exceed max_dur) -- confirm against the auditok documentation.
audio_regions = auditok.split(
    input, min_dur=1, max_dur=10, max_silence=0.9, energy_threshold=20
)

# Report the start and end timestamp of every detected region.
for i, r in enumerate(audio_regions):
    print(f"Region {i}: {r.meta.start:.3f}s -- {r.meta.end:.3f}s")
Region 0: 0.300s -- 6.550s
Region 1: 7.450s -- 12.950s
Region 2: 13.150s -- 15.700s
Region 3: 15.900s -- 19.200s
Region 4: 19.350s -- 29.350s
Region 5: 29.700s -- 34.200s
Region 6: 34.300s -- 38.600s
Region 7: 39.000s -- 43.650s
Region 8: 43.700s -- 46.550s
Region 9: 46.750s -- 49.500s
Region 10: 49.550s -- 52.950s
Region 11: 53.000s -- 56.050s
Region 12: 56.250s -- 59.500s
Region 13: 59.700s -- 62.550s
Region 14: 63.150s -- 69.600s
Region 15: 69.650s -- 73.100s
Region 16: 73.400s -- 77.450s
Region 17: 77.800s -- 81.150s
Region 18: 81.350s -- 89.100s
Region 19: 89.500s -- 92.750s
Region 20: 92.950s -- 96.250s
Region 21: 96.500s -- 99.600s
Region 22: 99.850s -- 104.350s
Region 23: 104.500s -- 108.050s
# Load the same file through auditok, then split it with the parameters used
# for `auditok.split` above while also plotting the result.
regs = auditok.load(input)

# Kept as the cell's last expression so the notebook still displays the
# returned list of AudioRegion objects; dpi=600 is passed through for the plot.
regs.split_and_plot(
    min_dur=1, max_dur=10, max_silence=0.9, energy_threshold=20, dpi=600
)
[AudioRegion(duration=6.250, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=5.500, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=2.550, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.300, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=10.000, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=4.500, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=4.300, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=4.650, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=2.850, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=2.750, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.400, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.050, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.250, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=2.850, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=6.450, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.450, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=4.050, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.350, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=7.750, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.250, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.300, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.100, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=4.500, sampling_rate=44100, sample_width=2, channels=2),
 AudioRegion(duration=3.550, sampling_rate=44100, sample_width=2, channels=2)]