Trying to recreate Praat spectrograms
By prompting ChatGPT
I'm trying to get a decent-looking, Praat-like spectrogram from Python... Using ChatGPT, because I'm too lazy to figure this out by myself, though I think with the amount of prompting it took to get this far, I've probably used more time than I would have otherwise.
Praat gives me this:
from IPython import display
display.Image("/content/Screenshot 2025-03-23 at 17.29.04.png")
import numpy as np
import matplotlib.pyplot as plt
import librosa
from scipy.signal.windows import gaussian
def praat_striation_spectrogram(
filepath,
sr=None,
window_length_sec=0.003, # shorter window for voicing detail
hop_length_sec=0.001, # high time resolution
max_freq=5000,
dynamic_range=60,
preemph_cutoff=50,
cmap='gray_r',
dpi=200,
show_plot=True,
save_path=None
):
# Load audio
y, sr = librosa.load(filepath, sr=sr)
# Pre-emphasis
pre_emph_coeff = np.exp(-2 * np.pi * preemph_cutoff / sr)
y = np.append(y[0], y[1:] - pre_emph_coeff * y[:-1])
# Window/hop
win_length = int(window_length_sec * sr)
hop_length = int(hop_length_sec * sr)
n_fft = 2 ** int(np.ceil(np.log2(win_length)))
window = gaussian(win_length, std=win_length / 6)
# STFT
S = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
win_length=win_length, window=window)
S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
# Clip to dynamic range
S_db[S_db < (np.max(S_db) - dynamic_range)] = np.max(S_db) - dynamic_range
# Display
times = np.linspace(0, len(y) / sr, S_db.shape[1])
freqs = np.linspace(0, sr / 2, S_db.shape[0])
if show_plot or save_path:
fig, ax = plt.subplots(figsize=(12, 4), dpi=dpi)
img = ax.imshow(S_db,
origin='lower',
aspect='auto',
interpolation='nearest', # <- CRUCIAL for detail
cmap=cmap,
extent=[times[0], times[-1], freqs[0], freqs[-1]])
ax.set_title("Praat-style Spectrogram (Striation-focused)")
ax.set_xlabel("Time (s)")
ax.set_ylabel("Frequency (Hz)")
ax.set_ylim(0, max_freq)
fig.colorbar(img, ax=ax, label='dB')
plt.tight_layout()
if save_path:
plt.savefig(save_path, bbox_inches='tight')
if show_plot:
plt.show()
return S_db, sr
import librosa
import numpy as np
import matplotlib.pyplot as plt
example = librosa.example('libri1')
example
praat_striation_spectrogram("/content/testclip.wav", save_path="thing")
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal.windows import gaussian
from scipy.ndimage import zoom
# --------------------------
# STFT-BASED SPECTROGRAM
# --------------------------
def praat_stft_spectrogram(
filepath,
sr=None,
window_length_sec=0.003,
hop_length_sec=0.001,
max_freq=5000,
dynamic_range=60,
preemph_cutoff=50,
dpi=300,
upsample_factor=2,
cmap='gray_r'
):
y, sr = librosa.load(filepath, sr=sr)
# Pre-emphasis
pre_emph_coeff = np.exp(-2 * np.pi * preemph_cutoff / sr)
y = np.append(y[0], y[1:] - pre_emph_coeff * y[:-1])
win_length = int(window_length_sec * sr)
hop_length = int(hop_length_sec * sr)
n_fft = 2 ** int(np.ceil(np.log2(win_length)))
window = gaussian(win_length, std=win_length / 6)
S = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
win_length=win_length, window=window)
S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
# Dynamic range
S_db[S_db < (np.max(S_db) - dynamic_range)] = np.max(S_db) - dynamic_range
# Upsample spectrogram
S_db_upsampled = zoom(S_db, zoom=(upsample_factor, upsample_factor), order=3)
times_up = np.linspace(0, len(y) / sr, S_db_upsampled.shape[1])
freqs_up = np.linspace(0, sr / 2, S_db_upsampled.shape[0])
# Display
plt.figure(figsize=(16, 4), dpi=dpi)
plt.imshow(S_db_upsampled, origin='lower', aspect='auto',
interpolation='none', cmap=cmap,
extent=[times_up[0], times_up[-1], freqs_up[0], freqs_up[-1]])
plt.title("High-Resolution STFT Spectrogram (Praat-style, Upsampled)")
plt.xlabel("Time (s)")
plt.ylabel("Frequency (Hz)")
plt.ylim(0, max_freq)
plt.colorbar(label='dB')
plt.tight_layout()
plt.show()
return S_db_upsampled, sr
# --------------------------
# CWT-BASED SPECTROGRAM
# --------------------------
def wavelet_spectrogram(
filepath,
sr=None,
fmin=80,
fmax=5000,
voices_per_octave=48,
dynamic_range=60,
dpi=300,
cmap='gray_r'
):
import librosa.display
y, sr = librosa.load(filepath, sr=sr)
# CWT using the constant-Q transform
cqt = librosa.cqt(y, sr=sr, fmin=fmin, n_bins=int(np.ceil(voices_per_octave * np.log2(fmax / fmin))),
bins_per_octave=voices_per_octave)
C_db = librosa.amplitude_to_db(np.abs(cqt), ref=np.max)
C_db[C_db < (np.max(C_db) - dynamic_range)] = np.max(C_db) - dynamic_range
times = librosa.frames_to_time(np.arange(C_db.shape[1]), sr=sr)
freqs = librosa.cqt_frequencies(C_db.shape[0], fmin=fmin, bins_per_octave=voices_per_octave)
# Display
plt.figure(figsize=(16, 4), dpi=dpi)
plt.imshow(C_db, origin='lower', aspect='auto', interpolation='bicubic',
cmap=cmap, extent=[times[0], times[-1], freqs[0], freqs[-1]])
plt.title("Wavelet Spectrogram (CWT / Constant-Q)")
plt.xlabel("Time (s)")
plt.ylabel("Frequency (Hz)")
plt.ylim(0, fmax)
plt.colorbar(label='dB')
plt.tight_layout()
plt.show()
return C_db, sr
praat_stft_spectrogram("/content/testclip.wav")
wavelet_spectrogram("/content/testclip.wav")