Interesting links, 22/02/2023
Misc. interesting things.
20 Open-Source Single Speaker Speech Datasets
ResGrad: Residual Denoising Diffusion Probabilistic Models for Text to Speech
@misc{chen2022resgrad,
  doi    = {10.48550/ARXIV.2212.14518},
  author = {Chen, Zehua and Wu, Yihan and Leng, Yichong and Chen, Jiawei and Liu, Haohe and Tan, Xu and Cui, Yang and Wang, Ke and He, Lei and Zhao, Sheng and Bian, Jiang and Mandic, Danilo},
  title  = {{ResGrad}: Residual Denoising Diffusion Probabilistic Models for Text to Speech},
  year   = {2022},
}
Coqui TTS on CPU Real-Time Spanish Speech Synthesis
CVSS: A Massively Multilingual Speech-to-Speech Translation Corpus
@inproceedings{jia2022cvss,
  title     = { {CVSS} Corpus and Massively Multilingual Speech-to-Speech Translation},
  author    = {Jia, Ye and Tadmor Ramanovich, Michelle and Wang, Quan and Zen, Heiga},
  booktitle = {Proceedings of Language Resources and Evaluation Conference (LREC)},
  year      = {2022},
  pages     = {6691--6703},
}
FonBund: A Library for Combining Cross-lingual Phonological Segment Data
@inproceedings{46930,
  title     = {{FonBund}: A Library for Combining Cross-lingual Phonological Segment Data},
  author    = {Gutkin, Alexander and Jansche, Martin and Merkulova, Tatiana},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  year      = {2018},
  month     = may,
  address   = {Miyazaki, Japan},
  pages     = {2236--2240},
  url       = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/8889.pdf},
}
The Norwegian Parliamentary Speech Corpus
The Talk of Norway: a richly annotated corpus of the Norwegian parliament, 1998–2016
@article{lapponi_talk_2018,
  title   = {The {Talk} of {Norway}: a richly annotated corpus of the {Norwegian} parliament, 1998--2016},
  author  = {Lapponi, Emanuele and Søyland, Martin G. and Velldal, Erik and Oepen, Stephan},
  journal = {Language Resources and Evaluation},
  volume  = {52},
  number  = {3},
  month   = sep,
  year    = {2018},
  pages   = {873--893},
  issn    = {1574-0218},
  doi     = {10.1007/s10579-018-9411-5},
  url     = {https://doi.org/10.1007/s10579-018-9411-5},
}
Counting in Northern Sami – French wiktionary seems to have good inflection information.
Weighted finite-state transducers: the later years
Minimally Supervised Number Normalization
@article{gorman-sproat-2016-minimally,
  title     = {Minimally Supervised Number Normalization},
  author    = {Gorman, Kyle and Sproat, Richard},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {4},
  year      = {2016},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://aclanthology.org/Q16-1036},
  doi       = {10.1162/tacl_a_00114},
  pages     = {507--519},
}
Structured abbreviation expansion in context
@inproceedings{gorman-etal-2021-structured-abbreviation,
  title     = {Structured abbreviation expansion in context},
  author    = {Gorman, Kyle and Kirov, Christo and Roark, Brian and Sproat, Richard},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021},
  month     = nov,
  year      = {2021},
  address   = {Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.findings-emnlp.85},
  doi       = {10.18653/v1/2021.findings-emnlp.85},
  pages     = {995--1005},
}
FT Speech: Danish Parliament Speech Corpus
@inproceedings{Kirkedal_2020,
  title     = { {FT} Speech: Danish Parliament Speech Corpus},
  author    = {Kirkedal, Andreas and Stepanovi{\'{c}}, Marija and Plank, Barbara},
  booktitle = {Interspeech 2020},
  publisher = {ISCA},
  month     = oct,
  year      = {2020},
  doi       = {10.21437/interspeech.2020-3164},
}
Committee-Based Active Learning for Speech Recognition
@article{2011,
  internal-note = {NOTE(review): duplicate of hamanaka2011committee later in this file; prefer citing that key.},
  title   = {Committee-Based Active Learning for Speech Recognition},
  author  = {Hamanaka, Yuzo and Shinoda, Koichi and Tsutaoka, Takuya and Furui, Sadaoki and Emori, Tadashi and Koshinaka, Takafumi},
  journal = {IEICE Transactions on Information and Systems},
  volume  = {E94.D},
  number  = {10},
  pages   = {2015--2023},
  year    = {2011},
  doi     = {10.1587/transinf.E94.D.2015},
}
Phoneme-Level BERT for Enhanced Prosody of Text-to-Speech with Grapheme Predictions
@misc{li2023phoneme_bert,
  doi       = {10.48550/ARXIV.2301.08810},
  author    = {Li, Yinghao Aaron and Han, Cong and Jiang, Xilin and Mesgarani, Nima},
  title     = {Phoneme-Level {BERT} for Enhanced Prosody of Text-to-Speech with Grapheme Predictions},
  publisher = {arXiv},
  year      = {2023},
}
Multi-blank Transducers for Speech Recognition
@misc{xu2022multiblank,
  title     = {Multi-blank Transducers for Speech Recognition},
  author    = {Xu, Hainan and Jia, Fei and Majumdar, Somshubra and Watanabe, Shinji and Ginsburg, Boris},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2211.03541},
  year      = {2022},
}
Alpa: Automated Model-Parallel Deep Learning
Hearing voices at the National Library – a speech corpus and acoustic model for the Swedish language
@misc{malmsten2022kblabb_w2v,
  title     = {Hearing voices at the National Library -- a speech corpus and acoustic model for the Swedish language},
  author    = {Malmsten, Martin and Haffenden, Chris and Börjeson, Love},
  publisher = {arXiv},
  url       = {https://arxiv.org/abs/2205.03026},
  year      = {2022},
}
Applications of Lexicographic Semirings to Problems in Speech and Language Processing, pdf
@article{10.1162/COLI_a_00198,
  author  = {Sproat, Richard and Yarmohammadi, Mahsa and Shafran, Izhak and Roark, Brian},
  title   = {Applications of Lexicographic Semirings to Problems in Speech and Language Processing},
  journal = {Computational Linguistics},
  volume  = {40},
  number  = {4},
  pages   = {733--761},
  year    = {2014},
  month   = dec,
  issn    = {0891-2017},
  doi     = {10.1162/COLI_a_00198},
  url     = {https://doi.org/10.1162/COLI\_a\_00198},
}
Shallow Fusion of Weighted Finite-State Transducer and Language Model for Text Normalization
@inproceedings{bakhturina22_interspeech,
  author    = {Evelina Bakhturina and Yang Zhang and Boris Ginsburg},
  title     = { {Shallow Fusion of Weighted Finite-State Transducer and Language Model for Text Normalization}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {491--495},
  doi       = {10.21437/Interspeech.2022-11074},
}
There is more to Hungarian than goulash!
@inproceedings{kocabiyikoglu-etal-2018-augmenting,
  title     = {Augmenting Librispeech with {F}rench Translations: A Multimodal Corpus for Direct Speech Translation Evaluation},
  author    = {Kocabiyikoglu, Ali Can and Besacier, Laurent and Kraif, Olivier},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  month     = may,
  year      = {2018},
  address   = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://aclanthology.org/L18-1001},
}
End-to-End Automatic Speech Translation of Audiobooks, code
@misc{berard2018speechtranslation,
  title     = {End-to-End Automatic Speech Translation of Audiobooks},
  author    = {Bérard, Alexandre and Besacier, Laurent and Kocabiyikoglu, Ali Can and Pietquin, Olivier},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.1802.04200},
  url       = {https://arxiv.org/abs/1802.04200},
  year      = {2018},
}
Adding Conditional Control to Text-to-Image Diffusion Models, code
@misc{zhang2023controlnet,
  title     = {Adding Conditional Control to Text-to-Image Diffusion Models},
  author    = {Zhang, Lvmin and Agrawala, Maneesh},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2302.05543},
  year      = {2023},
}
New Year Concert 2017 Wiener Philharmoniker Part 1
LiroyvH/signal-export – PDF friendly; carderne/signal-export – HTML
Fine-tune FLAN-T5 for chat & dialogue summarization
BatchBALD: Efficient and Diverse Batch Acquisition for Deep Bayesian Active Learning, BlackHC/batchbald_redux
@misc{kirsch2019batchbald,
  title     = {BatchBALD: Efficient and Diverse Batch Acquisition for Deep Bayesian Active Learning},
  author    = {Kirsch, Andreas and van Amersfoort, Joost and Gal, Yarin},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.1906.08158},
  year      = {2019},
}
Dysarthric Speech Recognition From Raw Waveform with Parametric CNNs
@inproceedings{yue22_interspeech,
  author    = {Zhengjun Yue and Erfan Loweimi and Heidi Christensen and Jon Barker and Zoran Cvetkovic},
  title     = { {Dysarthric Speech Recognition From Raw Waveform with Parametric CNNs}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {31--35},
  doi       = {10.21437/Interspeech.2022-163},
}
Regularizing Transformer-based Acoustic Models by Penalizing Attention Weights
@inproceedings{lee22b_interspeech,
  author    = {Munhak Lee and Joon-Hyuk Chang and Sang-Eon Lee and Ju-Seok Seong and Chanhee Park and Haeyoung Kwon},
  title     = { {Regularizing Transformer-based Acoustic Models by Penalizing Attention Weights}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {56--60},
  doi       = {10.21437/Interspeech.2022-362},
}
Use of prosodic and lexical cues for disambiguating wh-words in Korean
@inproceedings{song22b_interspeech,
  author    = {Jieun Song and Hae-Sung Jeon and Jieun Kiaer},
  title     = { {Use of prosodic and lexical cues for disambiguating wh-words in Korean}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {81--85},
  doi       = {10.21437/Interspeech.2022-561},
}
Generalized Keyword Spotting using ASR embeddings
@inproceedings{r22_interspeech,
  author    = {Kirandevraj R and Vinod Kumar Kurmi and Vinay Namboodiri and C V Jawahar},
  title     = { {Generalized Keyword Spotting using ASR embeddings}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {126--130},
  doi       = {10.21437/Interspeech.2022-10450},
}
VoiceLab: Software for Fully Reproducible Automated Voice Analysis
@inproceedings{feinberg22_interspeech,
  author    = {David Feinberg},
  title     = { {VoiceLab: Software for Fully Reproducible Automated Voice Analysis}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {351--355},
  doi       = {10.21437/Interspeech.2022-113},
}
neonbjb/tortoise-tts – A multi-voice TTS system trained with an emphasis on quality
TorToiSe - Spending Compute for High Quality TTS
Parsing Icelandic Alþingi Transcripts: Parliamentary Speeches as a Genre
@inproceedings{runarsson-sigurdsson-2020-parsing,
  title     = {Parsing {I}celandic Al{\th}ingi Transcripts: Parliamentary Speeches as a Genre},
  author    = {R{\'u}narsson, Kristj{\'a}n and Sigur{\dh}sson, Einar Freyr},
  booktitle = {Proceedings of the Second ParlaCLARIN Workshop},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.parlaclarin-1.9},
  pages     = {44--50},
  language  = {English},
  ISBN      = {979-10-95546-47-4},
}
Unified Speech-Text Pre-training for Speech Translation and Recognition
@misc{tang2022unified,
  title  = {Unified Speech-Text Pre-training for Speech Translation and Recognition},
  author = {Tang, Yun and Gong, Hongyu and Dong, Ning and Wang, Changhan and Hsu, Wei-Ning and Gu, Jiatao and Baevski, Alexei and Li, Xian and Mohamed, Abdelrahman and Auli, Michael and Pino, Juan},
  doi    = {10.48550/ARXIV.2204.05409},
  year   = {2022},
}
s2s-ft: Sequence-to-Sequence Fine-Tuning
SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data
@misc{zhang2022speechlm,
  title  = {SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data},
  author = {Zhang, Ziqiang and Chen, Sanyuan and Zhou, Long and Wu, Yu and Ren, Shuo and Liu, Shujie and Yao, Zhuoyuan and Gong, Xun and Dai, Lirong and Li, Jinyu and Wei, Furu},
  doi    = {10.48550/ARXIV.2209.15329},
  year   = {2022},
}
Active learning in speech recognition
open-mmlab/mmhuman3d – OpenMMLab 3D Human Parametric Model Toolbox and Benchmark
zapis liczb wielocyfrowych – spacjami
r9y9/pyreaper – A python wrapper for REAPER
Fine-tune FLAN-T5 for chat & dialogue summarization
Getting Started with DeepSpeed for Inferencing Transformer based Models
facebookresearch/metaseq – Repo for external large-scale work
Learning Audio-Video Modalities from Image Captions, github
@inproceedings{nagrani2022learning,
  title     = {Learning Audio Video Modalities from Image Captions},
  author    = {Nagrani, Arsha and Hongsuck Seo, Paul and Seybold, Bryan and Hauth, Anja and Manen, Santiago and Sun, Chen and Schmid, Cordelia},
  booktitle = {ECCV},
  year      = {2022},
}
Serving OPT-175B, BLOOM-176B and CodeGen-16B using Alpa
k2-fsa/sherpa – Speech-to-text server framework with next-gen Kaldi
k2-fsa/kaldifst – Python wrapper for OpenFST and its extensions from Kaldi. Also support reading/writing ark/scp files
NeMo Joint Intent and Slot Classification
jonatasgrosman/wav2vec2-large-xlsr-53-hungarian
NbAiLab/whisper-sami-demo, model
Damage Control During Domain Adaptation for Transducer Based Automatic Speech Recognition
@misc{majumdar2022damagecontrol,
  title  = {Damage Control During Domain Adaptation for Transducer Based Automatic Speech Recognition},
  author = {Majumdar, Somshubra and Acharya, Shantanu and Lavrukhin, Vitaly and Ginsburg, Boris},
  doi    = {10.48550/ARXIV.2210.03255},
  year   = {2022},
}
jonatasgrosman/wav2vec2-large-xlsr-53-hungarian
vitouphy/wav2vec2-xls-r-300m-phoneme, training
lucidrains/audiolm-pytorch – Implementation of AudioLM, a SOTA Language Modeling Approach to Audio Generation out of Google Research, in Pytorch
lucidrains/PaLM-rlhf-pytorch – Implementation of RLHF (Reinforcement Learning with Human Feedback) on top of the PaLM architecture. Basically ChatGPT but with PaLM
google-research/tuning_playbook – A playbook for systematically maximizing the performance of deep learning models.
Castles and palaces of Greater Budapest
OpenAI’s Whisper: 7 must-know libraries and add-ons built on top of it
Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers
jumon/zac – Zero-shot Audio Classification using Whisper
linto-ai/whisper-timestamped – Multilingual Automatic Speech Recognition with word-level timestamps and confidence
Guiding Frozen Language Models with Learned Soft Prompts
google-research/prompt-tuning – Original Implementation of Prompt Tuning from Lester, et al, 2021
The Flan Collection: Advancing open source methods for instruction tuning
Active and Semi-Supervised Learning in ASR: Benefits on the Acoustic and Language Models
@misc{drugman2019active,
  doi    = {10.48550/ARXIV.1903.02852},
  author = {Drugman, Thomas and Pylkkonen, Janne and Kneser, Reinhard},
  title  = {Active and Semi-Supervised Learning in {ASR}: Benefits on the Acoustic and Language Models},
  year   = {2019},
}
Domain-Adversarial Training of Neural Networks
@misc{ganin2015domainadversarial,
  doi    = {10.48550/ARXIV.1505.07818},
  author = {Ganin, Yaroslav and Ustinova, Evgeniya and Ajakan, Hana and Germain, Pascal and Larochelle, Hugo and Laviolette, François and Marchand, Mario and Lempitsky, Victor},
  title  = {Domain-Adversarial Training of Neural Networks},
  year   = {2015},
}
CLSE: Corpus of Linguistically Significant Entities, corpus
@misc{chuklin2022clse,
  title  = {CLSE: Corpus of Linguistically Significant Entities},
  author = {Chuklin, Aleksandr and Zhao, Justin and Kale, Mihir},
  doi    = {10.48550/ARXIV.2211.02423},
  year   = {2022},
}
@inproceedings{angsal2022framing,
  title     = {Linguistic Framing of Political Terror: Distant and Close Readings of the Discourse on Terrorism in the Swedish Parliament 1993--2018},
  booktitle = {CLARIN Annual Conference Proceedings, 10--12 October 2022, Prague, Czechia. Eds. Tomaž Erjavec \& Maria Eskevich},
  author    = {Ängsal, Magnus Pettersson and Brodén, Daniel and Fridlund, Mats and Olsson, Leif-Jöran and Öhberg, Patrik},
  year      = {2022},
  address   = {Prague},
}
Finland Swedish Automatic Speech Recognition, pdf
@mastersthesis{raitolahti2022,
  title    = { {Finland Swedish Automatic Speech Recognition}},
  author   = {Raitolahti, Otto-Ville},
  school   = {Aalto University. School of Science},
  type     = {Master's thesis},
  year     = {2022},
  language = {English},
  pages    = {53},
  url      = {http://urn.fi/URN:NBN:fi:aalto-202203272601},
}
Building an ASR Corpus Using Althingi’s Parliamentary Speeches
@inproceedings{helgadottir2017,
  author    = {Inga Rún Helgadóttir and Róbert Kjaran and Anna Björk Nikulásdóttir and Jón Guðnason},
  title     = {Building an ASR Corpus Using Althingi’s Parliamentary Speeches},
  booktitle = {Proc. Interspeech 2017},
  year      = {2017},
  pages     = {2163--2167},
  doi       = {10.21437/Interspeech.2017-903},
  url       = {http://dx.doi.org/10.21437/Interspeech.2017-903},
}
Streaming model for Acoustic to Articulatory Inversion with transformer networks, pdf
@inproceedings{udupa22_interspeech,
  author    = {Sathvik Udupa and Aravind Illa and Prasanta Ghosh},
  title     = { {Streaming model for Acoustic to Articulatory Inversion with transformer networks}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {625--629},
  doi       = {10.21437/Interspeech.2022-10159},
}
Acquisition of allophonic variation in second language speech: An acoustic and articulatory study of English laterals by Japanese speakers, pdf
@inproceedings{nagamine22_interspeech,
  author    = {Takayuki Nagamine},
  title     = { {Acquisition of allophonic variation in second language speech: An acoustic and articulatory study of English laterals by Japanese speakers}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {644--648},
  doi       = {10.21437/Interspeech.2022-11020},
}
Unsupervised Text-to-Speech Synthesis by Unsupervised Automatic Speech Recognition, pdf
@inproceedings{ni22_interspeech,
  author    = {Junrui Ni and Liming Wang and Heting Gao and Kaizhi Qian and Yang Zhang and Shiyu Chang and Mark Hasegawa-Johnson},
  title     = { {Unsupervised Text-to-Speech Synthesis by Unsupervised Automatic Speech Recognition}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {461--465},
  doi       = {10.21437/Interspeech.2022-816},
}
Mixed-Phoneme BERT: Improving BERT with Mixed Phoneme and Sup-Phoneme Representations for Text to Speech, pdf
@inproceedings{zhang22i_interspeech,
  author    = {Guangyan Zhang and Kaitao Song and Xu Tan and Daxin Tan and Yuzi Yan and Yanqing Liu and Gang Wang and Wei Zhou and Tao Qin and Tan Lee and Sheng Zhao},
  title     = { {Mixed-Phoneme BERT: Improving BERT with Mixed Phoneme and Sup-Phoneme Representations for Text to Speech}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {456--460},
  doi       = {10.21437/Interspeech.2022-621},
}
Active learning in speech recognition
Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models
@misc{vijayakumar2016diversebeamsearch,
  title  = {Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models},
  author = {Vijayakumar, Ashwin K and Cogswell, Michael and Selvaraju, Ramprasath R. and Sun, Qing and Lee, Stefan and Crandall, David and Batra, Dhruv},
  doi    = {10.48550/ARXIV.1610.02424},
  year   = {2016},
}
Towards Multimodal Sarcasm Detection (An Obviously Perfect Paper)
@inproceedings{castro-etal-2019-towards,
  title     = {Towards Multimodal Sarcasm Detection (An {\_}{O}bviously{\_} Perfect Paper)},
  author    = {Castro, Santiago and Hazarika, Devamanyu and P{\'e}rez-Rosas, Ver{\'o}nica and Zimmermann, Roger and Mihalcea, Rada and Poria, Soujanya},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  month     = jul,
  year      = {2019},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/P19-1455},
  doi       = {10.18653/v1/P19-1455},
  pages     = {4619--4629},
}
jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli
facebook/wav2vec2-xlsr-53-phon-cv-babel-ft
Aditya3107/wav2vec2-Irish-common-voice-Fleurs-living-audio-300m
microsoft/trocr-large-handwritten
TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models
@misc{li2021trocr,
  doi    = {10.48550/ARXIV.2109.10282},
  author = {Li, Minghao and Lv, Tengchao and Chen, Jingye and Cui, Lei and Lu, Yijuan and Florencio, Dinei and Zhang, Cha and Li, Zhoujun and Wei, Furu},
  title  = {{TrOCR}: Transformer-based Optical Character Recognition with Pre-trained Models},
  year   = {2021},
}
Pauses, gaps and overlaps in conversations
dmort27/morphotactics – Library for implementing morphotactic FSTs using Pynini and OpenFST
25 Hungarian Words that every foreigner should learn
LMC-SMCA: A New Active Learning Method in ASR
@article{9363163,
  author  = {Sun, Xiusong and Wang, Bo and Liu, Shaohan and Lu, Tingxiang and Shan, Xin and Yang, Qun},
  journal = {IEEE Access},
  title   = {{LMC-SMCA}: A New Active Learning Method in {ASR}},
  year    = {2021},
  volume  = {9},
  pages   = {37011--37021},
  doi     = {10.1109/ACCESS.2021.3062157},
}
Active Learning For Automatic Speech Recognition
@inproceedings{hakkanitur2002active,
  author    = {Hakkani-Tur, Dilek and Gorin, Allen},
  title     = {Active Learning For Automatic Speech Recognition},
  booktitle = {2002 IEEE International Conference on Acoustics, Speech, and Signal Processing},
  year      = {2002},
  doi       = {10.1109/ICASSP.2002.5745510},
  internal-note = {NOTE(review): duplicate of entry 5745510 below; the old journal field wrongly named ICASSP-88 for this 2002 paper.},
}
Maximizing global entropy reduction for active learning in speech recognition
@inproceedings{4960685,
  author    = {Varadarajan, Balakrishnan and Yu, Dong and Deng, Li and Acero, Alex},
  booktitle = {2009 IEEE International Conference on Acoustics, Speech and Signal Processing},
  title     = {Maximizing global entropy reduction for active learning in speech recognition},
  year      = {2009},
  pages     = {4721--4724},
  doi       = {10.1109/ICASSP.2009.4960685},
}
Active learning for accent adaptation in Automatic Speech Recognition
@inproceedings{6424250,
  author    = {Nallasamy, Udhyakumar and Metze, Florian and Schultz, Tanja},
  booktitle = {2012 IEEE Spoken Language Technology Workshop (SLT)},
  title     = {Active learning for accent adaptation in Automatic Speech Recognition},
  year      = {2012},
  pages     = {360--365},
  doi       = {10.1109/SLT.2012.6424250},
}
Active learning: theory and applications to automatic speech recognition
@article{1453593,
  author  = {Riccardi, Giuseppe and Hakkani-Tur, Dilek},
  journal = {IEEE Transactions on Speech and Audio Processing},
  title   = {Active learning: theory and applications to automatic speech recognition},
  year    = {2005},
  volume  = {13},
  number  = {4},
  pages   = {504--511},
  doi     = {10.1109/TSA.2005.848882},
}
A confusion network based confidence measure for active learning in speech recognition
@inproceedings{4906813,
  author    = {Chen, Wei and Liu, Gang and Guo, Jun},
  booktitle = {2008 International Conference on Natural Language Processing and Knowledge Engineering},
  title     = {A confusion network based confidence measure for active learning in speech recognition},
  year      = {2008},
  pages     = {1--6},
  doi       = {10.1109/NLPKE.2008.4906813},
}
Active learning for automatic speech recognition
@inproceedings{5745510,
  author    = {Hakkani-Tür, Dilek and Riccardi, Giuseppe and Gorin, Allen},
  booktitle = {2002 IEEE International Conference on Acoustics, Speech, and Signal Processing},
  title     = {Active learning for automatic speech recognition},
  year      = {2002},
  volume    = {4},
  pages     = {IV-3904--IV-3907},
  doi       = {10.1109/ICASSP.2002.5745510},
}
Overview of Active Learning for Deep Learning
SpeechPainter: Text-conditioned Speech Inpainting
@inproceedings{borsos22_interspeech,
  author    = {Zalan Borsos and Matthew Sharifi and Marco Tagliasacchi},
  title     = { {SpeechPainter: Text-conditioned Speech Inpainting}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {431--435},
  doi       = {10.21437/Interspeech.2022-194},
}
Active and unsupervised learning for automatic speech recognition, pdf
@inproceedings{riccardi03_eurospeech,
  author    = {Giuseppe Riccardi and Dilek Z. Hakkani-Tur},
  title     = { {Active and unsupervised learning for automatic speech recognition}},
  booktitle = {Proc. 8th European Conference on Speech Communication and Technology (Eurospeech 2003)},
  year      = {2003},
  pages     = {1825--1828},
  doi       = {10.21437/Eurospeech.2003-552},
}
Committee-Based Active Learning for Speech Recognition, pdf
@article{hamanaka2011committee,
  title   = {Committee-Based Active Learning for Speech Recognition},
  author  = {Hamanaka, Yuzo and Shinoda, Koichi and Tsutaoka, Takuya and Furui, Sadaoki and Emori, Tadashi and Koshinaka, Takafumi},
  journal = {IEICE Transactions on Information and Systems},
  volume  = {E94.D},
  number  = {10},
  pages   = {2015--2023},
  year    = {2011},
  doi     = {10.1587/transinf.E94.D.2015},
}
Adaptable End-to-End ASR Models using Replaceable Internal LMs and Residual Softmax
@misc{deng2023adaptableasr,
  title  = {Adaptable End-to-End ASR Models using Replaceable Internal LMs and Residual Softmax},
  author = {Deng, Keqi and Woodland, Philip C.},
  doi    = {10.48550/ARXIV.2302.08579},
  year   = {2023},
}