[ { "Word Error Rate (WER)": 2.2, "code_links": [ { "title": "s-omranpour/ConvolutionalSpeechRecognition", "url": "https://github.com/s-omranpour/ConvolutionalSpeechRecognition" } ], "date": "2019-10-01", "date2": 20191001, "model": "Multi-Stream Self-Attention With Dilated 1D Convolutions", "paper": { "title": "State-of-the-Art Speech Recognition Using Multi-Stream Self-Attention With Dilated 1D Convolutions", "url": "https://cknow.io/lib/35a2912594df0b5b" }, "paper_data_uoa": "35a2912594df0b5b" }, { "Word Error Rate (WER)": 2.3, "code_links": [ { "title": "rwth-i6/returnn", "url": "https://github.com/rwth-i6/returnn" }, { "title": "rwth-i6/returnn-experiments", "url": "https://github.com/rwth-i6/returnn-experiments/tree/master/2019-librispeech-system" } ], "date": "2019-05-08", "date2": 20190508, "model": "Hybrid model with Transformer rescoring", "paper": { "title": "RWTH ASR Systems for LibriSpeech: Hybrid vs Attention -- w/o Data Augmentation", "url": "https://cknow.io/lib/3c4bbcadf9194f9e" }, "paper_data_uoa": "3c4bbcadf9194f9e" }, { "Word Error Rate (WER)": 2.5, "code_links": [ { "title": "mozilla/DeepSpeech", "url": "https://github.com/mozilla/DeepSpeech" }, { "title": "shelling203/SpecAugment", "url": "https://github.com/shelling203/SpecAugment" }, { "title": "DemisEom/SpecAugment", "url": "https://github.com/DemisEom/SpecAugment" }, { "title": "lRomul/argus-freesound", "url": "https://github.com/lRomul/argus-freesound" }, { "title": "ebouteillon/freesound-audio-tagging-2019", "url": "https://github.com/ebouteillon/freesound-audio-tagging-2019" }, { "title": "sh951011/Korean-Speech-Recognition", "url": "https://github.com/sh951011/Korean-Speech-Recognition/blob/master/package/feature.py" }, { "title": "sh951011/Korean-Speech-Recognition", "url": "https://github.com/sh951011/Korean-Speech-Recognition" }, { "title": "KimJeongSun/SpecAugment_numpy_scipy", "url": "https://github.com/KimJeongSun/SpecAugment_numpy_scipy" }, { "title": "hgstudent/las", "url": "https://github.com/hgstudent/las" }, { "title": "viig99/mixmatch-freesound", "url": "https://github.com/viig99/mixmatch-freesound" }, { "title": "knlee-voice/PaperNotes", "url": "https://github.com/knlee-voice/PaperNotes" } ], "date": "2019-04-18", "date2": 20190418, "model": "LAS + SpecAugment ", "paper": { "title": "SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition", "url": "https://cknow.io/lib/15408ea14d4c82f9" }, "paper_data_uoa": "15408ea14d4c82f9" }, { "Word Error Rate (WER)": 2.6, "code_links": [ { "title": "espnet/espnet", "url": "https://github.com/espnet/espnet" } ], "date": "2019-09-13", "date2": 20190913, "model": "Transformer", "paper": { "title": "A Comparative Study on Transformer vs RNN in Speech Applications", "url": "https://cknow.io/lib/d68911301970abbe" }, "paper_data_uoa": "d68911301970abbe" }, { "Word Error Rate (WER)": 2.7, "code_links": [ { "title": "rwth-i6/returnn", "url": "https://github.com/rwth-i6/returnn" }, { "title": "rwth-i6/returnn-experiments", "url": "https://github.com/rwth-i6/returnn-experiments/tree/master/2019-librispeech-system" } ], "date": "2019-05-08", "date2": 20190508, "model": "6 layer BLSTM with LSTM transformer rescoring", "paper": { "title": "RWTH ASR Systems for LibriSpeech: Hybrid vs Attention -- w/o Data Augmentation", "url": "https://cknow.io/lib/3c4bbcadf9194f9e" }, "paper_data_uoa": "3c4bbcadf9194f9e" }, { "Percentage error": 2.95, "Word Error Rate (WER)": 2.95, "code_links": [ { "title": "NVIDIA/OpenSeq2Seq", "url": "https://github.com/NVIDIA/OpenSeq2Seq" } ], "date": "2019-04-05", "date2": 20190405, "model": "deep 1d convs + ctc + external lm rescoring", "paper": { "title": "Jasper: An End-to-End Convolutional Neural Acoustic Model", "url": "https://cknow.io/lib/d52115ec6c741b09" }, "paper_data_uoa": "d52115ec6c741b09" }, { "Percentage error": 3.06, "Word Error Rate (WER)": 3.06, "code_links": [], "date": "2018-04-15", "date2": 20180415, "model": "tdnn + chain + rnnlm rescoring", "paper": { "title": "Neural Network Language Modeling with Letter-based Features and Importance Sampling", "url": "https://cknow.io/lib/5449c55a2744f069" }, "paper_data_uoa": "5449c55a2744f069" }, { "Word Error Rate (WER)": 3.2, "code_links": [ { "title": "mozilla/DeepSpeech", "url": "https://github.com/mozilla/DeepSpeech" }, { "title": "shelling203/SpecAugment", "url": "https://github.com/shelling203/SpecAugment" }, { "title": "DemisEom/SpecAugment", "url": "https://github.com/DemisEom/SpecAugment" }, { "title": "lRomul/argus-freesound", "url": "https://github.com/lRomul/argus-freesound" }, { "title": "ebouteillon/freesound-audio-tagging-2019", "url": "https://github.com/ebouteillon/freesound-audio-tagging-2019" }, { "title": "sh951011/Korean-Speech-Recognition", "url": "https://github.com/sh951011/Korean-Speech-Recognition/blob/master/package/feature.py" }, { "title": "sh951011/Korean-Speech-Recognition", "url": "https://github.com/sh951011/Korean-Speech-Recognition" }, { "title": "KimJeongSun/SpecAugment_numpy_scipy", "url": "https://github.com/KimJeongSun/SpecAugment_numpy_scipy" }, { "title": "hgstudent/las", "url": "https://github.com/hgstudent/las" }, { "title": "viig99/mixmatch-freesound", "url": "https://github.com/viig99/mixmatch-freesound" }, { "title": "knlee-voice/PaperNotes", "url": "https://github.com/knlee-voice/PaperNotes" } ], "date": "2019-04-18", "date2": 20190418, "model": "LAS", "paper": { "title": "SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition", "url": "https://cknow.io/lib/15408ea14d4c82f9" }, "paper_data_uoa": "15408ea14d4c82f9" }, { "Word Error Rate (WER)": 3.26, "code_links": [], "date": "2018-12-17", "date2": 20181217, "model": "Convolutional Speech Recognition", "paper": { "title": "Fully Convolutional Speech Recognition", "url": "https://cknow.io/lib/d1db60cb34872a82" }, "paper_data_uoa": "d1db60cb34872a82" }, { "Word Error Rate (WER)": 3.6, "code_links": [ { "title": "colaprograms/speechify", "url": "https://github.com/colaprograms/speechify" } ], "date": "2019-02-05", "date2": 20190205, "model": "Model Unit Exploration", "paper": { "title": "On the Choice of Modeling Unit for Sequence-to-Sequence Speech Recognition", "url": "https://cknow.io/lib/a8d18da42e5bf4b2" }, "paper_data_uoa": "a8d18da42e5bf4b2" }, { "Percentage error": 3.82, "Word Error Rate (WER)": 3.82, "code_links": [ { "title": "rwth-i6/returnn", "url": "https://github.com/rwth-i6/returnn" }, { "title": "rwth-i6/returnn-experiments", "url": "https://github.com/rwth-i6/returnn-experiments" }, { "title": "pvsimoes/our_espnet", "url": "https://github.com/pvsimoes/our_espnet" } ], "date": "2018-05-08", "date2": 20180508, "model": "Seq-to-seq attention", "paper": { "title": "Improved training of end-to-end attention models for speech recognition", "url": "https://cknow.io/lib/fea16774f95979e5" }, "paper_data_uoa": "fea16774f95979e5" }, { "Percentage error": 4.8, "Word Error Rate (WER)": 4.8, "code_links": [ { "title": "9dwLab/Wav2Letter", "url": "https://github.com/9dwLab/Wav2Letter" }, { "title": "eric-erki/wav2letter", "url": "https://github.com/eric-erki/wav2letter" }, { "title": "MrMao/wav2letter", "url": "https://github.com/MrMao/wav2letter" } ], "date": "2017-12-22", "date2": 20171222, "model": "Gated ConvNets", "paper": { "title": "Letter-Based Speech Recognition with Gated ConvNets", "url": "https://cknow.io/lib/a1ae4626026eeab0" }, "paper_data_uoa": "a1ae4626026eeab0" }, { "Percentage error": 5.33, "Word Error Rate (WER)": 5.33, "code_links": [ { "title": "tensorflow/models", "url": "https://github.com/tensorflow/models/tree/master/research/deep_speech" }, { "title": "PaddlePaddle/models", "url": "https://github.com/PaddlePaddle/models" }, { "title": "baidu-research/warp-ctc", "url": "https://github.com/baidu-research/warp-ctc" }, { "title": "SeanNaren/deepspeech.pytorch", "url": "https://github.com/SeanNaren/deepspeech.pytorch" }, { "title": "DeepMark/deepmark", "url": "https://github.com/DeepMark/deepmark" }, { "title": "SeanNaren/deepspeech.torch", "url": "https://github.com/SeanNaren/deepspeech.torch" }, { "title": "robmsmt/KerasDeepSpeech", "url": "https://github.com/robmsmt/KerasDeepSpeech" }, { "title": "myrtleSoftware/deepspeech", "url": "https://github.com/myrtleSoftware/deepspeech" }, { "title": "noahchalifour/deepspeech2-tensorflow", "url": "https://github.com/noahchalifour/deepspeech2-tensorflow" }, { "title": "UnofficialJuliaMirror/DeepMark-deepmark", "url": "https://github.com/UnofficialJuliaMirror/DeepMark-deepmark" }, { "title": "UnofficialJuliaMirrorSnapshots/DeepMark-deepmark", "url": "https://github.com/UnofficialJuliaMirrorSnapshots/DeepMark-deepmark" }, { "title": "huylenguyen806/vnasr", "url": "https://github.com/huylenguyen806/vnasr" } ], "date": "2015-12-08", "date2": 20151208, "model": "Deep Speech 2", "paper": { "title": "Deep Speech 2: End-to-End Speech Recognition in English and Mandarin", "url": "https://cknow.io/lib/ae2bdef963023028" }, "paper_data_uoa": "ae2bdef963023028" }, { "Percentage error": 5.42, "Word Error Rate (WER)": 5.42, "code_links": [], "date": "2017-12-19", "date2": 20171219, "model": "CTC + policy learning", "paper": { "title": "Improving End-to-End Speech Recognition with Policy Learning", "url": "https://cknow.io/lib/2f2e4513c004ee39" }, "paper_data_uoa": "2f2e4513c004ee39" }, { "Percentage error": 6.2, "Word Error Rate (WER)": 6.2, "code_links": [ { "title": "mravanelli/pytorch-kaldi", "url": "https://github.com/mravanelli/pytorch-kaldi" }, { "title": "xpz123/pytorch-kaldi", "url": "https://github.com/xpz123/pytorch-kaldi" } ], "date": "2018-11-19", "date2": 20181119, "model": "Li-GRU", "paper": { "title": "The PyTorch-Kaldi Speech Recognition Toolkit", "url": "https://cknow.io/lib/437631894186a91a" }, "paper_data_uoa": "437631894186a91a" }, { "Percentage error": 6.4, "Word Error Rate (WER)": 6.4, "code_links": [ { "title": "snipsco/snips-nlu", "url": "https://github.com/snipsco/snips-nlu" }, { "title": "snipsco/nlu-benchmark", "url": "https://github.com/snipsco/nlu-benchmark" }, { "title": "MiuLab/SlotGated-SLU", "url": "https://github.com/MiuLab/SlotGated-SLU" } ], "date": "2018-05-25", "date2": 20180525, "model": "Snips", "paper": { "title": "Snips Voice Platform: an embedded Spoken Language Understanding system for private-by-design voice interfaces", "url": "https://cknow.io/lib/4466ad0dd77ea158" }, "paper_data_uoa": "4466ad0dd77ea158" }, { "Percentage error": 20.84, "Word Error Rate (WER)": 7.19, "code_links": [ { "title": "facebookresearch/wav2letter", "url": "https://github.com/facebookresearch/wav2letter" } ], "date": "2020-02-24", "date2": 20200224, "model": "Local Prior Matching (Large Model)", "paper": { "title": "Semi-Supervised Speech Recognition via Local Prior Matching", "url": "https://cknow.io/lib/85675128034707b7" }, "paper_data_uoa": "85675128034707b7" } ]