%%capture
%pip install git+https://github.com/shivammehta25/Matcha-TTS

%%capture
!apt install espeak-ng

!git lfs install

Git LFS initialized.

from huggingface_hub import notebook_login

notebook_login()

!git clone https://huggingface.co/jimregan/matcha-mmconv-test1

Cloning into 'matcha-mmconv-test1'...
remote: Enumerating objects: 202, done.
remote: Counting objects: 100% (199/199), done.
remote: Compressing objects: 100% (177/177), done.
remote: Total 202 (delta 33), reused 0 (delta 0), pack-reused 3 (from 1)
Receiving objects: 100% (202/202), 28.85 KiB | 7.21 MiB/s, done.
Resolving deltas: 100% (33/33), done.
Filtering content: 100% (19/19), 2.29 GiB | 9.25 MiB/s, done.
Encountered 1 file(s) that may not have been copied correctly on Windows:
	tensorboard/version_0/events.out.tfevents.1729625755.096b91286027.10182.0

See: `git lfs help smudge` for more details.

!matcha-tts --vocoder hifigan_univ_v1 --model matcha_vctk --text "test to start" --output_folder /content/output

/usr/local/lib/python3.10/dist-packages/matcha/cli.py:182: UserWarning: [!] Speaker ID not provided! Using speaker ID 0
  warnings.warn(warn_, UserWarning)
[-] GPU not available or forced CPU run! Using CPU
[!] Configurations: 
	- Model: matcha_vctk
	- Vocoder: hifigan_univ_v1
	- Temperature: 0.667
	- Speaking rate: 0.85
	- Number of ODE steps: 10
	- Speaker: 0
[-] Model not found at /root/.local/share/matcha_tts/matcha_vctk.ckpt! Will download it
[-] Model not found at /root/.local/share/matcha_tts/hifigan_univ_v1! Will download it
[!] Loading matcha_vctk!
/usr/local/lib/python3.10/dist-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.
  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)
[+] matcha_vctk loaded!
[!] Loading hifigan_univ_v1!
/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
  WeightNorm.apply(module, name, dim)
/usr/local/lib/python3.10/dist-packages/matcha/cli.py:87: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)["generator"])
Removing weight norm...
[+] hifigan_univ_v1 loaded!
====================================================================================================
[1] - Input text: test to start
[1] - Phonetised text: tˈɛst tə stˈɑːɹt
[🍵] Whisking Matcha-T(ea)TS for: 1
[🍵-1] Matcha-TTS RTF: 1.3030
[🍵-1] Matcha-TTS + VOCODER RTF: 3.1106
[+] Waveform saved: /content/output/utterance_001_speaker_000.wav
====================================================================================================
[🍵] Average Matcha-TTS RTF: 1.3030 ± 0.0
[🍵] Average Matcha-TTS + VOCODER RTF: 3.1106 ± 0.0
[🍵] Enjoy the freshly whisked 🍵 Matcha-TTS!

!ls -al /root/.local/share/matcha_tts/hifigan_univ_v1

-rw-r--r-- 1 root root 55788858 Oct 26 16:04 /root/.local/share/matcha_tts/hifigan_univ_v1

import datetime as dt
from pathlib import Path

import IPython.display as ipd
import numpy as np
import soundfile as sf
import torch
from tqdm.auto import tqdm

# Hifigan imports
from matcha.hifigan.config import v1
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.env import AttrDict
from matcha.hifigan.models import Generator as HiFiGAN
# Matcha imports
from matcha.models.matcha_tts import MatchaTTS
from matcha.text import sequence_to_text, text_to_sequence
from matcha.utils.model import denormalize
from matcha.utils.utils import get_user_data_dir, intersperse

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MATCHA_CHECKPOINT = "/content/matcha-mmconv-test1/checkpoints/last.ckpt"
HIFIGAN_CHECKPOINT = "/root/.local/share/matcha_tts/hifigan_univ_v1"
OUTPUT_FOLDER = "synth_output"

def load_model(checkpoint_path):
    model = MatchaTTS.load_from_checkpoint(checkpoint_path, map_location=device)
    model.eval()
    return model
count_params = lambda x: f"{sum(p.numel() for p in x.parameters()):,}"


model = load_model(MATCHA_CHECKPOINT)
print(f"Model loaded! Parameter count: {count_params(model)}")

/usr/local/lib/python3.10/dist-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.
  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)

Model loaded! Parameter count: 18,204,193

def load_vocoder(checkpoint_path):
    h = AttrDict(v1)
    hifigan = HiFiGAN(h).to(device)
    hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)['generator'])
    _ = hifigan.eval()
    hifigan.remove_weight_norm()
    return hifigan

vocoder = load_vocoder(HIFIGAN_CHECKPOINT)
denoiser = Denoiser(vocoder, mode='zeros')

/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
  WeightNorm.apply(module, name, dim)
<ipython-input-13-f37e5e9a1c3a>:4: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)['generator'])

Removing weight norm...

@torch.inference_mode()
def process_text(text: str):
    x = torch.tensor(intersperse(text_to_sequence(text, ['english_cleaners2'])[0], 0),dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]],dtype=torch.long, device=device)
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    return {
        'x_orig': text,
        'x': x,
        'x_lengths': x_lengths,
        'x_phones': x_phones
    }


@torch.inference_mode()
def synthesise(text, spks=None):
    text_processed = process_text(text)
    start_t = dt.datetime.now()
    output = model.synthesise(
        text_processed['x'],
        text_processed['x_lengths'],
        n_timesteps=n_timesteps,
        temperature=temperature,
        spks=spks,
        length_scale=length_scale
    )
    # merge everything to one dict
    output.update({'start_t': start_t, **text_processed})
    return output

@torch.inference_mode()
def to_waveform(mel, vocoder):
    audio = vocoder(mel).clamp(-1, 1)
    audio = denoiser(audio.squeeze(0), strength=0.00025).cpu().squeeze()
    return audio.cpu().squeeze()

def save_to_folder(filename: str, output: dict, folder: str):
    folder = Path(folder)
    folder.mkdir(exist_ok=True, parents=True)
    np.save(folder / f'{filename}', output['mel'].cpu().numpy())
    sf.write(folder / f'{filename}.wav', output['waveform'], 22050, 'PCM_24')

n_timesteps = 10

## Changes to the speaking rate
length_scale=1.0

## Sampling temperature
temperature = 0.667

texts = [
    "It's a little bit confusing"
]

outputs, rtfs = [], []
rtfs_w = []
for i, text in enumerate(tqdm(texts)):
    output = synthesise(text) #, torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0))
    output['waveform'] = to_waveform(output['mel'], vocoder)

    # Compute Real Time Factor (RTF) with HiFi-GAN
    t = (dt.datetime.now() - output['start_t']).total_seconds()
    rtf_w = t * 22050 / (output['waveform'].shape[-1])

    ## Pretty print
    print(f"{'*' * 53}")
    print(f"Input text - {i}")
    print(f"{'-' * 53}")
    print(output['x_orig'])
    print(f"{'*' * 53}")
    print(f"Phonetised text - {i}")
    print(f"{'-' * 53}")
    print(output['x_phones'])
    print(f"{'*' * 53}")
    print(f"RTF:\t\t{output['rtf']:.6f}")
    print(f"RTF Waveform:\t{rtf_w:.6f}")
    rtfs.append(output['rtf'])
    rtfs_w.append(rtf_w)

    ## Display the synthesised waveform
    ipd.display(ipd.Audio(output['waveform'], rate=22050))

    ## Save the generated waveform
    save_to_folder(i, output, OUTPUT_FOLDER)

print(f"Number of ODE steps: {n_timesteps}")
print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
print(f"Mean RTF Waveform (incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}")

*****************************************************
Input text - 0
-----------------------------------------------------
It's a little bit confusing
*****************************************************
Phonetised text - 0
-----------------------------------------------------
_ɪ_t_s_ _ɐ_ _l_ˈ_ɪ_ɾ_ə_l_ _b_ˈ_ɪ_t_ _k_ə_n_f_j_ˈ_u_ː_z_ɪ_ŋ_
*****************************************************
RTF:		0.388168
RTF Waveform:	2.407171

Number of ODE steps: 10
Mean RTF:				0.388168 ± 0.000000
Mean RTF Waveform (incl. vocoder):	2.407171 ± 0.000000