# Install Matcha-TTS from the `more-data` ref of the jimregan fork.
%pip install git+https://github.com/jimregan/Matcha-TTS@more-data

# Install the espeak-ng system package
# (presumably the phonemiser backend used by the text cleaners -- TODO confirm).
!apt install espeak-ng

# Set up Git LFS before cloning
# (presumably so large files such as the checkpoint are fetched -- TODO confirm).
!git lfs install

Git LFS initialized.

# Clone the matcha-pl-gosia repository from Hugging Face into the working directory.
!git clone https://huggingface.co/jimregan/matcha-pl-gosia

# Run the CLI once with the stock matcha_vctk model and hifigan_univ_v1 vocoder.
# Per the log below, this downloads both into /root/.local/share/matcha_tts/.
!matcha-tts --vocoder hifigan_univ_v1 --model matcha_vctk --text "test to start" --output_folder /content/output

/usr/local/lib/python3.10/dist-packages/matcha/cli.py:182: UserWarning: [!] Speaker ID not provided! Using speaker ID 0
  warnings.warn(warn_, UserWarning)
[-] GPU not available or forced CPU run! Using CPU
[!] Configurations: 
	- Model: matcha_vctk
	- Vocoder: hifigan_univ_v1
	- Temperature: 0.667
	- Speaking rate: 0.85
	- Number of ODE steps: 10
	- Speaker: 0
[-] Model not found at /root/.local/share/matcha_tts/matcha_vctk.ckpt! Will download it
[-] Model not found at /root/.local/share/matcha_tts/hifigan_univ_v1! Will download it
[!] Loading matcha_vctk!
/usr/local/lib/python3.10/dist-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.
  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)
[+] matcha_vctk loaded!
[!] Loading hifigan_univ_v1!
/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
  WeightNorm.apply(module, name, dim)
/usr/local/lib/python3.10/dist-packages/matcha/cli.py:87: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)["generator"])
Removing weight norm...
[+] hifigan_univ_v1 loaded!
====================================================================================================
[1] - Input text: test to start
[1] - Phonetised text: tˈɛst tə stˈɑːɹt
[🍵] Whisking Matcha-T(ea)TS for: 1
[🍵-1] Matcha-TTS RTF: 0.4420
[🍵-1] Matcha-TTS + VOCODER RTF: 1.5224
[+] Waveform saved: /content/output/utterance_001_speaker_000.wav
====================================================================================================
[🍵] Average Matcha-TTS RTF: 0.4420 ± 0.0
[🍵] Average Matcha-TTS + VOCODER RTF: 1.5224 ± 0.0
[🍵] Enjoy the freshly whisked 🍵 Matcha-TTS!

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Check that the hifigan_univ_v1 file the CLI run reported downloading is present.
!ls -al /root/.local/share/matcha_tts/hifigan_univ_v1

-rw-r--r-- 1 root root 55788858 Oct 20 19:43 /root/.local/share/matcha_tts/hifigan_univ_v1

import datetime as dt
from pathlib import Path

import IPython.display as ipd
import numpy as np
import soundfile as sf
import torch
from tqdm.auto import tqdm

# Hifigan imports
from matcha.hifigan.config import v1
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.env import AttrDict
from matcha.hifigan.models import Generator as HiFiGAN
# Matcha imports
from matcha.models.matcha_tts import MatchaTTS
from matcha.text import sequence_to_text, text_to_sequence
from matcha.utils.model import denormalize
from matcha.utils.utils import get_user_data_dir, intersperse

# Matcha-TTS checkpoint to synthesise with (presumably inside the
# matcha-pl-gosia clone made earlier -- verify the working directory).
MATCHA_CHECKPOINT = "/content/matcha-pl-gosia/checkpoints/last.ckpt"
# HiFi-GAN vocoder weights; same path the `ls -al` check above lists.
HIFIGAN_CHECKPOINT = "/root/.local/share/matcha_tts/hifigan_univ_v1"
# Folder (relative path) that save_to_folder() writes the results into.
OUTPUT_FOLDER = "synth_output"

def load_model(checkpoint_path):
    """Restore a MatchaTTS model from ``checkpoint_path`` in eval mode.

    The checkpoint tensors are mapped onto the module-level ``device``
    while loading.
    """
    restored = MatchaTTS.load_from_checkpoint(
        checkpoint_path,
        map_location=device,
    )
    # Inference only: switch the module out of training mode.
    restored.eval()
    return restored
def count_params(x):
    """Return the total parameter count of ``x`` as a comma-grouped string.

    Args:
        x: Object exposing ``parameters()`` whose items provide ``numel()``
            (for example a ``torch.nn.Module``).

    Returns:
        The summed element count formatted with thousands separators,
        e.g. ``"18,204,193"``.
    """
    total = sum(p.numel() for p in x.parameters())
    return f"{total:,}"


# Load the checkpoint at MATCHA_CHECKPOINT and report how many parameters it has.
model = load_model(MATCHA_CHECKPOINT)
print(f"Model loaded! Parameter count: {count_params(model)}")

/usr/local/lib/python3.10/dist-packages/diffusers/models/lora.py:393: FutureWarning: `LoRACompatibleLinear` is deprecated and will be removed in version 1.0.0. Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`.
  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)

Model loaded! Parameter count: 18,204,193

def load_vocoder(checkpoint_path):
    """Build a HiFi-GAN generator from the ``v1`` config and load its weights.

    The file at ``checkpoint_path`` must contain the generator weights under
    the ``'generator'`` key. The generator lives on the module-level
    ``device``; it is switched to eval mode and has its weight norm removed
    before being returned.
    """
    generator = HiFiGAN(AttrDict(v1)).to(device)
    # NOTE(review): torch.load unpickles with the default (non weights-only)
    # settings, so only point this at checkpoint files you trust.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator

# Load the HiFi-GAN vocoder, then build the denoiser from that same vocoder
# instance; to_waveform() uses this global denoiser.
vocoder = load_vocoder(HIFIGAN_CHECKPOINT)
denoiser = Denoiser(vocoder, mode='zeros')

/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
  WeightNorm.apply(module, name, dim)
<ipython-input-15-f37e5e9a1c3a>:4: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)['generator'])

Removing weight norm...

@torch.inference_mode()
def process_text(text: str):
    """Convert raw text into the tensors passed to the model.

    The text is run through ``text_to_sequence`` with ``polish_cleaners``,
    the resulting ids are interspersed with ``0``, and the result is placed
    on the module-level ``device`` with a leading axis of size one.

    Returns:
        dict with keys ``'x_orig'`` (the input text), ``'x'`` (id tensor),
        ``'x_lengths'`` (one-element tensor holding the size of the last
        axis of ``x``) and ``'x_phones'`` (the ids rendered back to text by
        ``sequence_to_text``).
    """
    symbol_ids = text_to_sequence(text, ['polish_cleaners'])[0]
    interspersed_ids = intersperse(symbol_ids, 0)
    x = torch.tensor(interspersed_ids, dtype=torch.long, device=device).unsqueeze(0)
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    return dict(
        x_orig=text,
        x=x,
        x_lengths=x_lengths,
        x_phones=sequence_to_text(x.squeeze(0).tolist()),
    )


@torch.inference_mode()
def synthesise(text, spks=None):
    """Run the global ``model`` on ``text`` and return its output dict.

    Sampling is controlled by the module-level ``n_timesteps``,
    ``temperature`` and ``length_scale``; ``spks`` is forwarded to
    ``model.synthesise`` unchanged.

    Returns:
        The dict produced by ``model.synthesise``, extended with ``'start_t'``
        (timestamp taken just before the model call) and every entry returned
        by ``process_text``.
    """
    inputs = process_text(text)
    # The clock starts after text processing, so timings derived from
    # 'start_t' exclude it.
    started_at = dt.datetime.now()
    output = model.synthesise(
        inputs['x'],
        inputs['x_lengths'],
        n_timesteps=n_timesteps,
        temperature=temperature,
        spks=spks,
        length_scale=length_scale,
    )
    # Fold the timestamp and the processed text into the same dict.
    output['start_t'] = started_at
    output.update(inputs)
    return output

@torch.inference_mode()
def to_waveform(mel, vocoder):
    """Turn a mel-spectrogram into a denoised waveform held on the CPU.

    Args:
        mel: Mel-spectrogram handed to ``vocoder`` (shape not visible here --
            presumably batched; TODO confirm).
        vocoder: Callable that maps ``mel`` to an audio tensor.

    Returns:
        The vocoder output clamped to ``[-1, 1]``, passed through the
        module-level ``denoiser`` with ``strength=0.00025``, moved to the CPU
        and squeezed.

    Note:
        Denoising always uses the global ``denoiser``, which is constructed
        from the global vocoder rather than from the ``vocoder`` argument.
    """
    audio = vocoder(mel).clamp(-1, 1)
    # A single .cpu().squeeze() is enough; repeating it on the already
    # squeezed CPU tensor changed nothing.
    return denoiser(audio.squeeze(0), strength=0.00025).cpu().squeeze()

def save_to_folder(filename: str, output: dict, folder: str):
    """Write the mel-spectrogram and waveform of one utterance into ``folder``.

    ``folder`` is created, including parents, when it does not exist yet.
    ``output['mel']`` is stored with ``np.save`` under ``filename`` and
    ``output['waveform']`` is written to ``<filename>.wav`` at 22050 Hz as
    24-bit PCM.
    """
    target_dir = Path(folder)
    target_dir.mkdir(parents=True, exist_ok=True)

    mel_path = target_dir / f'{filename}'
    wav_path = target_dir / f'{filename}.wav'

    np.save(mel_path, output['mel'].cpu().numpy())
    sf.write(wav_path, output['waveform'], 22050, 'PCM_24')

## Number of ODE steps (forwarded to model.synthesise as n_timesteps)
n_timesteps = 10

## Speaking rate (forwarded to model.synthesise as length_scale)
length_scale=1.0

## Sampling temperature
temperature = 0.667

## Sentences to synthesise, one utterance per list entry
texts = [
    "Bóbr! Ja pierdolę! Jakie bydlę!"
]

# Per-utterance bookkeeping. `outputs` is initialised here but nothing in
# this loop appends to it.
outputs = []
rtfs = []
rtfs_w = []

star_rule = '*' * 53
dash_rule = '-' * 53

for i, text in enumerate(tqdm(texts)):
    # A speaker tensor may be supplied as a second argument, e.g.
    # torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0)
    output = synthesise(text)
    output['waveform'] = to_waveform(output['mel'], vocoder)

    # Real Time Factor (RTF) with HiFi-GAN: seconds elapsed since
    # 'start_t' per second of generated audio at 22050 Hz.
    t = (dt.datetime.now() - output['start_t']).total_seconds()
    rtf_w = t * 22050 / output['waveform'].shape[-1]

    ## Pretty print
    print(
        star_rule,
        f"Input text - {i}",
        dash_rule,
        output['x_orig'],
        star_rule,
        f"Phonetised text - {i}",
        dash_rule,
        output['x_phones'],
        star_rule,
        f"RTF:\t\t{output['rtf']:.6f}",
        f"RTF Waveform:\t{rtf_w:.6f}",
        sep="\n",
    )
    rtfs.append(output['rtf'])
    rtfs_w.append(rtf_w)

    ## Play the synthesised waveform inline
    ipd.display(ipd.Audio(output['waveform'], rate=22050))

    ## Persist the mel and the waveform, named after the utterance index
    save_to_folder(i, output, OUTPUT_FOLDER)

## Summary over all utterances
print(f"Number of ODE steps: {n_timesteps}")
print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
print(f"Mean RTF Waveform (incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}")

*****************************************************
Input text - 0
-----------------------------------------------------
Bóbr! Ja pierdolę! Jakie bydlę!
*****************************************************
Phonetised text - 0
-----------------------------------------------------
_b_ˈ_u_b_r_!_ _j_a_ _p_ʲ_ɛ_r_d_ˈ_ɔ_l_ɛ_!_ _j_ˈ_a_k_ʲ_ɛ_ _b_ˈ_ɨ_d_l_ɛ_!_
*****************************************************
RTF:		0.540975
RTF Waveform:	2.019202

Number of ODE steps: 10
Mean RTF:				0.540975 ± 0.000000
Mean RTF Waveform (incl. vocoder):	2.019202 ± 0.000000