Interesting links, 22/02/2023
Misc. interesting things.
20 Open-Source Single Speaker Speech Datasets
ResGrad: Residual Denoising Diffusion Probabilistic Models for Text to Speech
@misc{chen2022resgrad,
  doi    = {10.48550/ARXIV.2212.14518},
  author = {Chen, Zehua and Wu, Yihan and Leng, Yichong and Chen, Jiawei and Liu, Haohe and Tan, Xu and Cui, Yang and Wang, Ke and He, Lei and Zhao, Sheng and Bian, Jiang and Mandic, Danilo},
  title  = {{ResGrad}: Residual Denoising Diffusion Probabilistic Models for Text to Speech},
  year   = {2022},
}
Coqui TTS on CPU Real-Time Spanish Speech Synthesis
CVSS: A Massively Multilingual Speech-to-Speech Translation Corpus
@inproceedings{jia2022cvss,
  title     = { {CVSS} Corpus and Massively Multilingual Speech-to-Speech Translation},
  author    = {Jia, Ye and Tadmor Ramanovich, Michelle and Wang, Quan and Zen, Heiga},
  booktitle = {Proceedings of Language Resources and Evaluation Conference (LREC)},
  year      = {2022},
  pages     = {6691--6703},
}
FonBund: A Library for Combining Cross-lingual Phonological Segment Data
@inproceedings{46930,
  title     = {{FonBund}: A Library for Combining Cross-lingual Phonological Segment Data},
  author    = {Gutkin, Alexander and Jansche, Martin and Merkulova, Tatiana},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  year      = {2018},
  month     = may,
  address   = {Miyazaki, Japan},
  pages     = {2236--2240},
  url       = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/8889.pdf},
}
The Norwegian Parliamentary Speech Corpus
The Talk of Norway: a richly annotated corpus of the Norwegian parliament, 1998–2016
@article{lapponi_talk_2018,
  title   = {The {Talk} of {Norway}: a richly annotated corpus of the {Norwegian} parliament, 1998--2016},
  author  = {Lapponi, Emanuele and Søyland, Martin G. and Velldal, Erik and Oepen, Stephan},
  journal = {Language Resources and Evaluation},
  volume  = {52},
  number  = {3},
  month   = sep,
  year    = {2018},
  pages   = {873--893},
  issn    = {1574-0218},
  doi     = {10.1007/s10579-018-9411-5},
  url     = {https://doi.org/10.1007/s10579-018-9411-5},
}
Counting in Northern Sami – French wiktionary seems to have good inflection information.
Weighted finite-state transducers: the later years
Minimally Supervised Number Normalization
@article{gorman-sproat-2016-minimally,
  title     = {Minimally Supervised Number Normalization},
  author    = {Gorman, Kyle and Sproat, Richard},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {4},
  year      = {2016},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://aclanthology.org/Q16-1036},
  doi       = {10.1162/tacl_a_00114},
  pages     = {507--519},
}
Structured abbreviation expansion in context
@inproceedings{gorman-etal-2021-structured-abbreviation,
  title     = {Structured abbreviation expansion in context},
  author    = {Gorman, Kyle and Kirov, Christo and Roark, Brian and Sproat, Richard},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021},
  month     = nov,
  year      = {2021},
  address   = {Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.findings-emnlp.85},
  doi       = {10.18653/v1/2021.findings-emnlp.85},
  pages     = {995--1005},
}
FT Speech: Danish Parliament Speech Corpus
@inproceedings{Kirkedal_2020,
  title     = { {FT} Speech: Danish Parliament Speech Corpus},
  author    = {Kirkedal, Andreas and Stepanovi{\'{c}}, Marija and Plank, Barbara},
  booktitle = {Interspeech 2020},
  publisher = {ISCA},
  month     = oct,
  year      = {2020},
  doi       = {10.21437/interspeech.2020-3164},
}
Committee-Based Active Learning for Speech Recognition
@article{2011,
  internal-note = {NOTE(review): duplicate of hamanaka2011committee later in this file; prefer citing that key.},
  title   = {Committee-Based Active Learning for Speech Recognition},
  author  = {Hamanaka, Yuzo and Shinoda, Koichi and Tsutaoka, Takuya and Furui, Sadaoki and Emori, Tadashi and Koshinaka, Takafumi},
  journal = {IEICE Transactions on Information and Systems},
  volume  = {E94.D},
  number  = {10},
  pages   = {2015--2023},
  year    = {2011},
  doi     = {10.1587/transinf.E94.D.2015},
}
Phoneme-Level BERT for Enhanced Prosody of Text-to-Speech with Grapheme Predictions
@misc{li2023phoneme_bert,
  doi       = {10.48550/ARXIV.2301.08810},
  author    = {Li, Yinghao Aaron and Han, Cong and Jiang, Xilin and Mesgarani, Nima},
  title     = {Phoneme-Level {BERT} for Enhanced Prosody of Text-to-Speech with Grapheme Predictions},
  publisher = {arXiv},
  year      = {2023},
}
Multi-blank Transducers for Speech Recognition
@misc{xu2022multiblank,
  title     = {Multi-blank Transducers for Speech Recognition},
  author    = {Xu, Hainan and Jia, Fei and Majumdar, Somshubra and Watanabe, Shinji and Ginsburg, Boris},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2211.03541},
  year      = {2022},
}
Alpa: Automated Model-Parallel Deep Learning
Hearing voices at the National Library – a speech corpus and acoustic model for the Swedish language
@misc{malmsten2022kblabb_w2v,
  title     = {Hearing voices at the National Library -- a speech corpus and acoustic model for the Swedish language},
  author    = {Malmsten, Martin and Haffenden, Chris and Börjeson, Love},
  publisher = {arXiv},
  url       = {https://arxiv.org/abs/2205.03026},
  year      = {2022},
}
Applications of Lexicographic Semirings to Problems in Speech and Language Processing, pdf
@article{10.1162/COLI_a_00198,
  author  = {Sproat, Richard and Yarmohammadi, Mahsa and Shafran, Izhak and Roark, Brian},
  title   = {Applications of Lexicographic Semirings to Problems in Speech and Language Processing},
  journal = {Computational Linguistics},
  volume  = {40},
  number  = {4},
  pages   = {733--761},
  year    = {2014},
  month   = dec,
  issn    = {0891-2017},
  doi     = {10.1162/COLI_a_00198},
  url     = {https://doi.org/10.1162/COLI\_a\_00198},
}
Shallow Fusion of Weighted Finite-State Transducer and Language Model for Text Normalization
@inproceedings{bakhturina22_interspeech,
  author    = {Evelina Bakhturina and Yang Zhang and Boris Ginsburg},
  title     = { {Shallow Fusion of Weighted Finite-State Transducer and Language Model for Text Normalization}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {491--495},
  doi       = {10.21437/Interspeech.2022-11074},
}
There is more to Hungarian than goulash!
@inproceedings{kocabiyikoglu-etal-2018-augmenting,
  title     = {Augmenting Librispeech with {F}rench Translations: A Multimodal Corpus for Direct Speech Translation Evaluation},
  author    = {Kocabiyikoglu, Ali Can and Besacier, Laurent and Kraif, Olivier},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  month     = may,
  year      = {2018},
  address   = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://aclanthology.org/L18-1001},
}
End-to-End Automatic Speech Translation of Audiobooks, code
@misc{berard2018speechtranslation,
  title     = {End-to-End Automatic Speech Translation of Audiobooks},
  author    = {Bérard, Alexandre and Besacier, Laurent and Kocabiyikoglu, Ali Can and Pietquin, Olivier},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.1802.04200},
  url       = {https://arxiv.org/abs/1802.04200},
  year      = {2018},
}
Adding Conditional Control to Text-to-Image Diffusion Models, code
@misc{zhang2023controlnet,
  title     = {Adding Conditional Control to Text-to-Image Diffusion Models},
  author    = {Zhang, Lvmin and Agrawala, Maneesh},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2302.05543},
  year      = {2023},
}
New Year Concert 2017 Wiener Philharmoniker Part 1
LiroyvH/signal-export – PDF friendly; carderne/signal-export – HTML
Fine-tune FLAN-T5 for chat & dialogue summarization
BatchBALD: Efficient and Diverse Batch Acquisition for Deep Bayesian Active Learning, BlackHC/batchbald_redux
@misc{kirsch2019batchbald,
  title     = {BatchBALD: Efficient and Diverse Batch Acquisition for Deep Bayesian Active Learning},
  author    = {Kirsch, Andreas and van Amersfoort, Joost and Gal, Yarin},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.1906.08158},
  year      = {2019},
}
Dysarthric Speech Recognition From Raw Waveform with Parametric CNNs
@inproceedings{yue22_interspeech,
  author    = {Zhengjun Yue and Erfan Loweimi and Heidi Christensen and Jon Barker and Zoran Cvetkovic},
  title     = { {Dysarthric Speech Recognition From Raw Waveform with Parametric CNNs}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {31--35},
  doi       = {10.21437/Interspeech.2022-163},
}
Regularizing Transformer-based Acoustic Models by Penalizing Attention Weights
@inproceedings{lee22b_interspeech,
  author    = {Munhak Lee and Joon-Hyuk Chang and Sang-Eon Lee and Ju-Seok Seong and Chanhee Park and Haeyoung Kwon},
  title     = { {Regularizing Transformer-based Acoustic Models by Penalizing Attention Weights}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {56--60},
  doi       = {10.21437/Interspeech.2022-362},
}
Use of prosodic and lexical cues for disambiguating wh-words in Korean
@inproceedings{song22b_interspeech,
  author    = {Jieun Song and Hae-Sung Jeon and Jieun Kiaer},
  title     = { {Use of prosodic and lexical cues for disambiguating wh-words in Korean}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {81--85},
  doi       = {10.21437/Interspeech.2022-561},
}
Generalized Keyword Spotting using ASR embeddings
@inproceedings{r22_interspeech,
  author    = {Kirandevraj R and Vinod Kumar Kurmi and Vinay Namboodiri and C V Jawahar},
  title     = { {Generalized Keyword Spotting using ASR embeddings}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {126--130},
  doi       = {10.21437/Interspeech.2022-10450},
}
VoiceLab: Software for Fully Reproducible Automated Voice Analysis
@inproceedings{feinberg22_interspeech,
  author    = {David Feinberg},
  title     = { {VoiceLab: Software for Fully Reproducible Automated Voice Analysis}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {351--355},
  doi       = {10.21437/Interspeech.2022-113},
}
neonbjb/tortoise-tts – A multi-voice TTS system trained with an emphasis on quality
TorToiSe - Spending Compute for High Quality TTS
Parsing Icelandic Alþingi Transcripts: Parliamentary Speeches as a Genre
@inproceedings{runarsson-sigurdsson-2020-parsing,
  title     = {Parsing {I}celandic Al{\th}ingi Transcripts: Parliamentary Speeches as a Genre},
  author    = {R{\'u}narsson, Kristj{\'a}n and Sigur{\dh}sson, Einar Freyr},
  booktitle = {Proceedings of the Second ParlaCLARIN Workshop},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.parlaclarin-1.9},
  pages     = {44--50},
  language  = {English},
  ISBN      = {979-10-95546-47-4},
}
Unified Speech-Text Pre-training for Speech Translation and Recognition
@misc{tang2022unified,
  title  = {Unified Speech-Text Pre-training for Speech Translation and Recognition},
  author = {Tang, Yun and Gong, Hongyu and Dong, Ning and Wang, Changhan and Hsu, Wei-Ning and Gu, Jiatao and Baevski, Alexei and Li, Xian and Mohamed, Abdelrahman and Auli, Michael and Pino, Juan},
  doi    = {10.48550/ARXIV.2204.05409},
  year   = {2022},
}
s2s-ft: Sequence-to-Sequence Fine-Tuning
SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data
@misc{zhang2022speechlm,
  title  = {SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data},
  author = {Zhang, Ziqiang and Chen, Sanyuan and Zhou, Long and Wu, Yu and Ren, Shuo and Liu, Shujie and Yao, Zhuoyuan and Gong, Xun and Dai, Lirong and Li, Jinyu and Wei, Furu},
  doi    = {10.48550/ARXIV.2209.15329},
  year   = {2022},
}
Active learning in speech recognition
open-mmlab/mmhuman3d – OpenMMLab 3D Human Parametric Model Toolbox and Benchmark
zapis liczb wielocyfrowych – spacjami
r9y9/pyreaper – A python wrapper for REAPER
Fine-tune FLAN-T5 for chat & dialogue summarization
Getting Started with DeepSpeed for Inferencing Transformer based Models
facebookresearch/metaseq – Repo for external large-scale work
Learning Audio-Video Modalities from Image Captions, github
@inproceedings{nagrani2022learning,
  title     = {Learning Audio Video Modalities from Image Captions},
  author    = {Nagrani, Arsha and Hongsuck Seo, Paul and Seybold, Bryan and Hauth, Anja and Manen, Santiago and Sun, Chen and Schmid, Cordelia},
  booktitle = {ECCV},
  year      = {2022},
}
Serving OPT-175B, BLOOM-176B and CodeGen-16B using Alpa
k2-fsa/sherpa – Speech-to-text server framework with next-gen Kaldi
k2-fsa/kaldifst – Python wrapper for OpenFST and its extensions from Kaldi. Also support reading/writing ark/scp files
NeMo Joint Intent and Slot Classification
jonatasgrosman/wav2vec2-large-xlsr-53-hungarian
NbAiLab/whisper-sami-demo, model
Damage Control During Domain Adaptation for Transducer Based Automatic Speech Recognition
@misc{majumdar2022damagecontrol,
  title  = {Damage Control During Domain Adaptation for Transducer Based Automatic Speech Recognition},
  author = {Majumdar, Somshubra and Acharya, Shantanu and Lavrukhin, Vitaly and Ginsburg, Boris},
  doi    = {10.48550/ARXIV.2210.03255},
  year   = {2022},
}
jonatasgrosman/wav2vec2-large-xlsr-53-hungarian
vitouphy/wav2vec2-xls-r-300m-phoneme, training
lucidrains/audiolm-pytorch – Implementation of AudioLM, a SOTA Language Modeling Approach to Audio Generation out of Google Research, in Pytorch
lucidrains/PaLM-rlhf-pytorch – Implementation of RLHF (Reinforcement Learning with Human Feedback) on top of the PaLM architecture. Basically ChatGPT but with PaLM
google-research/tuning_playbook – A playbook for systematically maximizing the performance of deep learning models.
Castles and palaces of Greater Budapest
OpenAI’s Whisper: 7 must-know libraries and add-ons built on top of it
Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers
jumon/zac – Zero-shot Audio Classification using Whisper
linto-ai/whisper-timestamped – Multilingual Automatic Speech Recognition with word-level timestamps and confidence
Guiding Frozen Language Models with Learned Soft Prompts
google-research/prompt-tuning – Original Implementation of Prompt Tuning from Lester, et al, 2021
The Flan Collection: Advancing open source methods for instruction tuning
Active and Semi-Supervised Learning in ASR: Benefits on the Acoustic and Language Models
@misc{drugman2019active,
  doi    = {10.48550/ARXIV.1903.02852},
  author = {Drugman, Thomas and Pylkkonen, Janne and Kneser, Reinhard},
  title  = {Active and Semi-Supervised Learning in {ASR}: Benefits on the Acoustic and Language Models},
  year   = {2019},
}
Domain-Adversarial Training of Neural Networks
@misc{ganin2015domainadversarial,
  doi    = {10.48550/ARXIV.1505.07818},
  author = {Ganin, Yaroslav and Ustinova, Evgeniya and Ajakan, Hana and Germain, Pascal and Larochelle, Hugo and Laviolette, François and Marchand, Mario and Lempitsky, Victor},
  title  = {Domain-Adversarial Training of Neural Networks},
  year   = {2015},
}
CLSE: Corpus of Linguistically Significant Entities, corpus
@misc{chuklin2022clse,
  title  = {CLSE: Corpus of Linguistically Significant Entities},
  author = {Chuklin, Aleksandr and Zhao, Justin and Kale, Mihir},
  doi    = {10.48550/ARXIV.2211.02423},
  year   = {2022},
}
@inproceedings{angsal2022framing,
  title     = {Linguistic Framing of Political Terror: Distant and Close Readings of the Discourse on Terrorism in the Swedish Parliament 1993--2018},
  booktitle = {CLARIN Annual Conference Proceedings, 10--12 October 2022, Prague, Czechia. Eds. Tomaž Erjavec \& Maria Eskevich},
  author    = {Ängsal, Magnus Pettersson and Brodén, Daniel and Fridlund, Mats and Olsson, Leif-Jöran and Öhberg, Patrik},
  year      = {2022},
  address   = {Prague},
}
Finland Swedish Automatic Speech Recognition, pdf
@mastersthesis{raitolahti2022,
  title    = { {Finland Swedish Automatic Speech Recognition}},
  author   = {Raitolahti, Otto-Ville},
  school   = {Aalto University. School of Science},
  type     = {Master's thesis},
  year     = {2022},
  language = {English},
  pages    = {53},
  url      = {http://urn.fi/URN:NBN:fi:aalto-202203272601},
}
Building an ASR Corpus Using Althingi’s Parliamentary Speeches
@inproceedings{helgadottir2017,
  author    = {Inga Rún Helgadóttir and Róbert Kjaran and Anna Björk Nikulásdóttir and Jón Guðnason},
  title     = {Building an ASR Corpus Using Althingi’s Parliamentary Speeches},
  booktitle = {Proc. Interspeech 2017},
  year      = {2017},
  pages     = {2163--2167},
  doi       = {10.21437/Interspeech.2017-903},
  url       = {http://dx.doi.org/10.21437/Interspeech.2017-903},
}
Streaming model for Acoustic to Articulatory Inversion with transformer networks, pdf
@inproceedings{udupa22_interspeech,
  author    = {Sathvik Udupa and Aravind Illa and Prasanta Ghosh},
  title     = { {Streaming model for Acoustic to Articulatory Inversion with transformer networks}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {625--629},
  doi       = {10.21437/Interspeech.2022-10159},
}
Acquisition of allophonic variation in second language speech: An acoustic and articulatory study of English laterals by Japanese speakers, pdf
@inproceedings{nagamine22_interspeech,
  author    = {Takayuki Nagamine},
  title     = { {Acquisition of allophonic variation in second language speech: An acoustic and articulatory study of English laterals by Japanese speakers}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {644--648},
  doi       = {10.21437/Interspeech.2022-11020},
}
Unsupervised Text-to-Speech Synthesis by Unsupervised Automatic Speech Recognition, pdf
@inproceedings{ni22_interspeech,
  author    = {Junrui Ni and Liming Wang and Heting Gao and Kaizhi Qian and Yang Zhang and Shiyu Chang and Mark Hasegawa-Johnson},
  title     = { {Unsupervised Text-to-Speech Synthesis by Unsupervised Automatic Speech Recognition}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {461--465},
  doi       = {10.21437/Interspeech.2022-816},
}
Mixed-Phoneme BERT: Improving BERT with Mixed Phoneme and Sup-Phoneme Representations for Text to Speech, pdf
@inproceedings{zhang22i_interspeech,
  author    = {Guangyan Zhang and Kaitao Song and Xu Tan and Daxin Tan and Yuzi Yan and Yanqing Liu and Gang Wang and Wei Zhou and Tao Qin and Tan Lee and Sheng Zhao},
  title     = { {Mixed-Phoneme BERT: Improving BERT with Mixed Phoneme and Sup-Phoneme Representations for Text to Speech}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {456--460},
  doi       = {10.21437/Interspeech.2022-621},
}
Active learning in speech recognition
Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models
@misc{vijayakumar2016diversebeamsearch,
  title  = {Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models},
  author = {Vijayakumar, Ashwin K and Cogswell, Michael and Selvaraju, Ramprasath R. and Sun, Qing and Lee, Stefan and Crandall, David and Batra, Dhruv},
  doi    = {10.48550/ARXIV.1610.02424},
  year   = {2016},
}
Towards Multimodal Sarcasm Detection (An Obviously Perfect Paper)
@inproceedings{castro-etal-2019-towards,
  title     = {Towards Multimodal Sarcasm Detection (An {\_}{O}bviously{\_} Perfect Paper)},
  author    = {Castro, Santiago and Hazarika, Devamanyu and P{\'e}rez-Rosas, Ver{\'o}nica and Zimmermann, Roger and Mihalcea, Rada and Poria, Soujanya},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  month     = jul,
  year      = {2019},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/P19-1455},
  doi       = {10.18653/v1/P19-1455},
  pages     = {4619--4629},
}
jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli
facebook/wav2vec2-xlsr-53-phon-cv-babel-ft
Aditya3107/wav2vec2-Irish-common-voice-Fleurs-living-audio-300m
microsoft/trocr-large-handwritten
TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models
@misc{li2021trocr,
  doi    = {10.48550/ARXIV.2109.10282},
  author = {Li, Minghao and Lv, Tengchao and Chen, Jingye and Cui, Lei and Lu, Yijuan and Florencio, Dinei and Zhang, Cha and Li, Zhoujun and Wei, Furu},
  title  = {{TrOCR}: Transformer-based Optical Character Recognition with Pre-trained Models},
  year   = {2021},
}
Pauses, gaps and overlaps in conversations
dmort27/morphotactics – Library for implementing morphotactic FSTs using Pynini and OpenFST
25 Hungarian Words that every foreigner should learn
LMC-SMCA: A New Active Learning Method in ASR
@article{9363163,
  author  = {Sun, Xiusong and Wang, Bo and Liu, Shaohan and Lu, Tingxiang and Shan, Xin and Yang, Qun},
  journal = {IEEE Access},
  title   = {{LMC-SMCA}: A New Active Learning Method in {ASR}},
  year    = {2021},
  volume  = {9},
  pages   = {37011--37021},
  doi     = {10.1109/ACCESS.2021.3062157},
}
Active Learning For Automatic Speech Recognition
@inproceedings{hakkanitur2002active,
  author    = {Hakkani-Tur, Dilek and Gorin, Allen},
  title     = {Active Learning For Automatic Speech Recognition},
  booktitle = {2002 IEEE International Conference on Acoustics, Speech, and Signal Processing},
  year      = {2002},
  doi       = {10.1109/ICASSP.2002.5745510},
  internal-note = {NOTE(review): duplicate of entry 5745510 below; the old journal field wrongly named ICASSP-88 for this 2002 paper.},
}
Maximizing global entropy reduction for active learning in speech recognition
@inproceedings{4960685,
  author    = {Varadarajan, Balakrishnan and Yu, Dong and Deng, Li and Acero, Alex},
  booktitle = {2009 IEEE International Conference on Acoustics, Speech and Signal Processing},
  title     = {Maximizing global entropy reduction for active learning in speech recognition},
  year      = {2009},
  pages     = {4721--4724},
  doi       = {10.1109/ICASSP.2009.4960685},
}
Active learning for accent adaptation in Automatic Speech Recognition
@inproceedings{6424250,
  author    = {Nallasamy, Udhyakumar and Metze, Florian and Schultz, Tanja},
  booktitle = {2012 IEEE Spoken Language Technology Workshop (SLT)},
  title     = {Active learning for accent adaptation in Automatic Speech Recognition},
  year      = {2012},
  pages     = {360--365},
  doi       = {10.1109/SLT.2012.6424250},
}
Active learning: theory and applications to automatic speech recognition
@article{1453593,
  author  = {Riccardi, Giuseppe and Hakkani-Tur, Dilek},
  journal = {IEEE Transactions on Speech and Audio Processing},
  title   = {Active learning: theory and applications to automatic speech recognition},
  year    = {2005},
  volume  = {13},
  number  = {4},
  pages   = {504--511},
  doi     = {10.1109/TSA.2005.848882},
}
A confusion network based confidence measure for active learning in speech recognition
@inproceedings{4906813,
  author    = {Chen, Wei and Liu, Gang and Guo, Jun},
  booktitle = {2008 International Conference on Natural Language Processing and Knowledge Engineering},
  title     = {A confusion network based confidence measure for active learning in speech recognition},
  year      = {2008},
  pages     = {1--6},
  doi       = {10.1109/NLPKE.2008.4906813},
}
Active learning for automatic speech recognition
@inproceedings{5745510,
  author    = {Hakkani-Tür, Dilek and Riccardi, Giuseppe and Gorin, Allen},
  booktitle = {2002 IEEE International Conference on Acoustics, Speech, and Signal Processing},
  title     = {Active learning for automatic speech recognition},
  year      = {2002},
  volume    = {4},
  pages     = {IV-3904--IV-3907},
  doi       = {10.1109/ICASSP.2002.5745510},
}
Overview of Active Learning for Deep Learning
SpeechPainter: Text-conditioned Speech Inpainting
@inproceedings{borsos22_interspeech,
  author    = {Zalan Borsos and Matthew Sharifi and Marco Tagliasacchi},
  title     = { {SpeechPainter: Text-conditioned Speech Inpainting}},
  booktitle = {Proc. Interspeech 2022},
  year      = {2022},
  pages     = {431--435},
  doi       = {10.21437/Interspeech.2022-194},
}
Active and unsupervised learning for automatic speech recognition, pdf
@inproceedings{riccardi03_eurospeech,
  author    = {Giuseppe Riccardi and Dilek Z. Hakkani-Tur},
  title     = { {Active and unsupervised learning for automatic speech recognition}},
  booktitle = {Proc. 8th European Conference on Speech Communication and Technology (Eurospeech 2003)},
  year      = {2003},
  pages     = {1825--1828},
  doi       = {10.21437/Eurospeech.2003-552},
}
Committee-Based Active Learning for Speech Recognition, pdf
@article{hamanaka2011committee,
  title   = {Committee-Based Active Learning for Speech Recognition},
  author  = {Hamanaka, Yuzo and Shinoda, Koichi and Tsutaoka, Takuya and Furui, Sadaoki and Emori, Tadashi and Koshinaka, Takafumi},
  journal = {IEICE Transactions on Information and Systems},
  volume  = {E94.D},
  number  = {10},
  pages   = {2015--2023},
  year    = {2011},
  doi     = {10.1587/transinf.E94.D.2015},
}
Adaptable End-to-End ASR Models using Replaceable Internal LMs and Residual Softmax
@misc{deng2023adaptableasr,
  title  = {Adaptable End-to-End ASR Models using Replaceable Internal LMs and Residual Softmax},
  author = {Deng, Keqi and Woodland, Philip C.},
  doi    = {10.48550/ARXIV.2302.08579},
  year   = {2023},
}