This is based on this. The main difference is the script that's being run, and setting up flashlight's python bindings

I already had the GAN model in gdrive; those files are available here.

Preparation

!pip install condacolab

Collecting condacolab
  Using cached condacolab-0.1.2-py3-none-any.whl (6.0 kB)
Installing collected packages: condacolab
Successfully installed condacolab-0.1.2

import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!

%%capture
!conda install -c pykaldi pykaldi -y

!git clone https://github.com/pytorch/fairseq/

fatal: destination path 'fairseq' already exists and is not an empty directory.

!git clone https://github.com/kpu/kenlm

fatal: destination path 'kenlm' already exists and is not an empty directory.

%%capture
!apt-get -y install libeigen3-dev liblzma-dev zlib1g-dev libbz2-dev

The python build doesn't build utils, so this is (probably) necessary

%cd /content/kenlm
!mkdir build
%cd build
!cmake ..
!make -j 4

%%capture
%cd /content/kenlm
!python setup.py install
%cd /tmp

import os
os.environ['PATH'] = f"{os.environ['PATH']}:/content/kenlm/build/bin/"
os.environ['FAIRSEQ_ROOT'] = '/content/fairseq'

%cd /content/fairseq/

/content/fairseq

For next cell, see here

%%capture
!pip install --editable ./
!python setup.py build develop

os.environ['HYDRA_FULL_ERROR'] = '1'

%%capture
!pip install editdistance

https://colab.research.google.com/github/corrieann/kaggle/blob/master/kaggle_api_in_colab.ipynb

%%capture
!pip install kaggle

from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 64 bytes

%cd /content

/content

!kaggle datasets download "jimregan/w2vu-cvsv-audio-processed"

Downloading w2vu-cvsv-audio-processed.zip to /content
100% 4.31G/4.31G [01:24<00:00, 102MB/s] 
100% 4.31G/4.31G [01:24<00:00, 54.7MB/s]

%%capture
!unzip /content/w2vu-cvsv-audio-processed.zip

!kaggle datasets download -d jimregan/w2vu-cvsv-prepared-text

Downloading w2vu-cvsv-prepared-text.zip to /content
 80% 14.0M/17.4M [00:00<00:00, 33.7MB/s]
100% 17.4M/17.4M [00:00<00:00, 44.2MB/s]

%%capture
!unzip w2vu-cvsv-prepared-text.zip

!rm *.zip

!cp /content/preppedtext/phones/dict* /content/precompute_pca512_cls128_mean

%cd /content

/content

!git clone https://github.com/flashlight/flashlight

Cloning into 'flashlight'...
remote: Enumerating objects: 17649, done.
remote: Counting objects: 100% (1523/1523), done.
remote: Compressing objects: 100% (718/718), done.
remote: Total 17649 (delta 827), reused 1336 (delta 761), pack-reused 16126
Receiving objects: 100% (17649/17649), 14.23 MiB | 24.82 MiB/s, done.
Resolving deltas: 100% (12298/12298), done.

%%capture
!apt install -q libfftw3-dev

Reading package lists...
Building dependency tree...
Reading state information...
libfftw3-dev is already the newest version (3.3.7-1).
cmake is already the newest version (3.10.2-1ubuntu2.18.04.1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.

%cd flashlight/bindings/python

/content/flashlight/bindings/python

%%capture
!pip install packaging

!USE_MKL=0 KENLM_ROOT=/content/kenlm python setup.py install

w2vu-generate

import torch
torch.version.cuda

'10.2'

torch.backends.cudnn.version()

7605

%cd /content/fairseq

/content/fairseq

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

%cd /content/fairseq/examples/wav2vec/unsupervised

/content/fairseq/examples/wav2vec/unsupervised

%%writefile rungan.sh
python w2vu_generate.py --config-dir config/generate --config-name viterbi \
fairseq.common.user_dir=/content/fairseq/examples/wav2vec/unsupervised \
fairseq.task.data=/content/precompute_pca512_cls128_mean \
fairseq.common_eval.path=/content/drive/MyDrive/w2vu/checkpoint_best.pt \
fairseq.dataset.gen_subset=valid results_path=/content/drive/MyDrive/w2vures

Overwriting rungan.sh

!bash rungan.sh

[2021-06-19 19:24:18,601][__main__][INFO] - {'_name': None, 'fairseq': {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/content/fairseq/examples/wav2vec/unsupervised', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': '/content/drive/MyDrive/w2vu/checkpoint_best.pt', 'post_process': None, 'quiet': True, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': pytorch_ddp, 'ddp_comm_hook': none, 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_algorithm': 'LocalSGD', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': never, 'zero_sharding': none, 'fp16': '${common.fp16}', 'memory_efficient_fp16': '${common.memory_efficient_fp16}', 'tpu': '${common.tpu}', 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False}, 'dataset': {'_name': None, 'num_workers': 1, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': None, 'batch_size': 1, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': '${dataset.max_tokens}', 'batch_size_valid': '${dataset.batch_size}', 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'valid', 'num_shards': 1, 'shard_id': 0}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 0, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': False, 'update_freq': [1], 'lr': [0.25], 'stop_min_lr': -1.0, 'use_bmuf': False}, 'checkpoint': {'_name': None, 'save_dir': 'checkpoints', 'restore_file': 'checkpoint_last.pt', 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 0, 'keep_interval_updates': -1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': False, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': '${common.model_parallel_size}'}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': '${distributed_training.distributed_world_size}'}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': '???', 'task': {'_name': 'unpaired_audio_text', 'labels': 'phn', 'data': '/content/precompute_pca512_cls128_mean', 'sort_by_length': False, 'shuffle': False, 'text_data': ''}, 'criterion': None, 'optimizer': None, 'lr_scheduler': None, 'scoring': None, 'bpe': None, 'tokenizer': None}, 'lm_weight': 2.0, 'w2l_decoder': <DecoderType.VITERBI: 1>, 'kaldi_decoder_config': None, 'lexicon': None, 'lm_model': None, 'unit_lm': False, 'beam_threshold': 50.0, 'beam_size_token': 100.0, 'beam': 5, 'nbest': 1, 'word_score': 1.0, 'unk_weight': -inf, 'sil_weight': 0.0, 'targets': None, 'results_path': '/content/drive/MyDrive/w2vures', 'post_process': 'silence', 'vocab_usage_power': 2.0, 'viterbi_transcript': None, 'min_lm_ppl': 0.0, 'min_vt_uer': 0.0, 'blank_weight': 0.0, 'blank_mode': 'set', 'sil_is_blank': False, 'unsupervised_tuning': False, 'is_ax': False, 'job_logging_cfg': {'version': 1, 'formatters': {'simple': {'format': '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'}}, 'handlers': {'console': {'class': 'logging.StreamHandler', 'formatter': 'simple', 'stream': 'ext://sys.stdout'}, 'file': {'class': 'logging.FileHandler', 'formatter': 'simple', 'filename': 'w2vu_generate.log'}}, 'root': {'level': 'INFO', 'handlers': ['console', 'file']}, 'disable_existing_loggers': False}}
[2021-06-19 19:24:18,629][__main__][INFO] - | loading model(s) from /content/drive/MyDrive/w2vu/checkpoint_best.pt
[2021-06-19 19:24:22,829][unsupervised.data.extracted_features_dataset][INFO] - loaded 2019, skipped 0 samples
[2021-06-19 19:24:22,829][unsupervised.tasks.unpaired_audio_text][INFO] - split valid has unpaired text? False
[2021-06-19 19:24:22,833][__main__][INFO] - | /content/precompute_pca512_cls128_mean valid 2019 examples
[2021-06-19 19:24:41,807][__main__][INFO] - WER: 158.79173813943652
[2021-06-19 19:24:41,808][__main__][INFO] - | Processed 2019 sentences (80270 tokens) in 18.9s (106.61 sentences/s, 4238.35 tokens/s)
[2021-06-19 19:24:41,809][__main__][INFO] - | Generate valid with beam=5, lm_weight=2.0, word_score=1.0, sil_weight=0.0, blank_weight=0.0, WER: 158.79173813943652, LM_PPL: inf, num feats: 130753, length: 80270, UER to viterbi: 0, score: inf