[ { "Word Error Rate (WER)": 5, "code_links": [ { "title": "rwth-i6/returnn", "url": "https://github.com/rwth-i6/returnn" }, { "title": "rwth-i6/returnn-experiments", "url": "https://github.com/rwth-i6/returnn-experiments/tree/master/2019-librispeech-system" } ], "date": "2019-05-08", "date2": 20190508, "model": "Hybrid model with Transformer rescoring", "paper": { "title": "RWTH ASR Systems for LibriSpeech: Hybrid vs Attention -- w/o Data Augmentation", "url": "https://cknow.io/lib/3c4bbcadf9194f9e" }, "paper_data_uoa": "3c4bbcadf9194f9e" }, { "Word Error Rate (WER)": 5.7, "code_links": [ { "title": "rwth-i6/returnn", "url": "https://github.com/rwth-i6/returnn" }, { "title": "rwth-i6/returnn-experiments", "url": "https://github.com/rwth-i6/returnn-experiments/tree/master/2019-librispeech-system" } ], "date": "2019-05-08", "date2": 20190508, "model": "6 layer BLSTM with LSTM transformer rescoring", "paper": { "title": "RWTH ASR Systems for LibriSpeech: Hybrid vs Attention -- w/o Data Augmentation", "url": "https://cknow.io/lib/3c4bbcadf9194f9e" }, "paper_data_uoa": "3c4bbcadf9194f9e" }, { "Word Error Rate (WER)": 5.7, "code_links": [ { "title": "espnet/espnet", "url": "https://github.com/espnet/espnet" } ], "date": "2019-09-13", "date2": 20190913, "model": "Transformer", "paper": { "title": "A Comparative Study on Transformer vs RNN in Speech Applications", "url": "https://cknow.io/lib/d68911301970abbe" }, "paper_data_uoa": "d68911301970abbe" }, { "Word Error Rate (WER)": 5.8, "code_links": [ { "title": "mozilla/DeepSpeech", "url": "https://github.com/mozilla/DeepSpeech" }, { "title": "shelling203/SpecAugment", "url": "https://github.com/shelling203/SpecAugment" }, { "title": "DemisEom/SpecAugment", "url": "https://github.com/DemisEom/SpecAugment" }, { "title": "lRomul/argus-freesound", "url": "https://github.com/lRomul/argus-freesound" }, { "title": "ebouteillon/freesound-audio-tagging-2019", "url": "https://github.com/ebouteillon/freesound-audio-tagging-2019" }, { "title": "sh951011/Korean-Speech-Recognition", "url": "https://github.com/sh951011/Korean-Speech-Recognition/blob/master/package/feature.py" }, { "title": "sh951011/Korean-Speech-Recognition", "url": "https://github.com/sh951011/Korean-Speech-Recognition" }, { "title": "KimJeongSun/SpecAugment_numpy_scipy", "url": "https://github.com/KimJeongSun/SpecAugment_numpy_scipy" }, { "title": "hgstudent/las", "url": "https://github.com/hgstudent/las" }, { "title": "viig99/mixmatch-freesound", "url": "https://github.com/viig99/mixmatch-freesound" }, { "title": "knlee-voice/PaperNotes", "url": "https://github.com/knlee-voice/PaperNotes" } ], "date": "2019-04-18", "date2": 20190418, "model": "LAS + SpecAugment", "paper": { "title": "SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition", "url": "https://cknow.io/lib/15408ea14d4c82f9" }, "paper_data_uoa": "15408ea14d4c82f9" }, { "Word Error Rate (WER)": 5.8, "code_links": [ { "title": "s-omranpour/ConvolutionalSpeechRecognition", "url": "https://github.com/s-omranpour/ConvolutionalSpeechRecognition" } ], "date": "2019-10-01", "date2": 20191001, "model": "Multi-Stream Self-Attention With Dilated 1D Convolutions", "paper": { "title": "State-of-the-Art Speech Recognition Using Multi-Stream Self-Attention With Dilated 1D Convolutions", "url": "https://cknow.io/lib/35a2912594df0b5b" }, "paper_data_uoa": "35a2912594df0b5b" }, { "Word Error Rate (WER)": 7.63, "code_links": [], "date": "2018-04-15", "date2": 20180415, "model": "tdnn + chain + rnnlm rescoring", "paper": { 
"title": "Neural Network Language Modeling with Letter-based Features and Importance Sampling", "url": "https://cknow.io/lib/5449c55a2744f069" }, "paper_data_uoa": "5449c55a2744f069" }, { "Word Error Rate (WER)": 8.79, "code_links": [ { "title": "NVIDIA/OpenSeq2Seq", "url": "https://github.com/NVIDIA/OpenSeq2Seq" } ], "date": "2019-04-05", "date2": 20190405, "model": "deep 1d convs + ctc + external lm rescoring", "paper": { "title": "Jasper: An End-to-End Convolutional Neural Acoustic Model", "url": "https://cknow.io/lib/d52115ec6c741b09" }, "paper_data_uoa": "d52115ec6c741b09" }, { "Word Error Rate (WER)": 10.47, "code_links": [], "date": "2018-12-17", "date2": 20181217, "model": "Convolutional Speech Recognition", "paper": { "title": "Fully Convolutional Speech Recognition", "url": "https://cknow.io/lib/d1db60cb34872a82" }, "paper_data_uoa": "d1db60cb34872a82" }, { "Word Error Rate (WER)": 13.25, "code_links": [ { "title": "tensorflow/models", "url": "https://github.com/tensorflow/models/tree/master/research/deep_speech" }, { "title": "PaddlePaddle/models", "url": "https://github.com/PaddlePaddle/models" }, { "title": "baidu-research/warp-ctc", "url": "https://github.com/baidu-research/warp-ctc" }, { "title": "SeanNaren/deepspeech.pytorch", "url": "https://github.com/SeanNaren/deepspeech.pytorch" }, { "title": "DeepMark/deepmark", "url": "https://github.com/DeepMark/deepmark" }, { "title": "SeanNaren/deepspeech.torch", "url": "https://github.com/SeanNaren/deepspeech.torch" }, { "title": "robmsmt/KerasDeepSpeech", "url": "https://github.com/robmsmt/KerasDeepSpeech" }, { "title": "myrtleSoftware/deepspeech", "url": "https://github.com/myrtleSoftware/deepspeech" }, { "title": "noahchalifour/deepspeech2-tensorflow", "url": "https://github.com/noahchalifour/deepspeech2-tensorflow" }, { "title": "UnofficialJuliaMirror/DeepMark-deepmark", "url": "https://github.com/UnofficialJuliaMirror/DeepMark-deepmark" }, { "title": "UnofficialJuliaMirrorSnapshots/DeepMark-deepmark", "url": "https://github.com/UnofficialJuliaMirrorSnapshots/DeepMark-deepmark" }, { "title": "huylenguyen806/vnasr", "url": "https://github.com/huylenguyen806/vnasr" } ], "date": "2015-12-08", "date2": 20151208, "model": "Deep Speech 2", "paper": { "title": "Deep Speech 2: End-to-End Speech Recognition in English and Mandarin", "url": "https://cknow.io/lib/ae2bdef963023028" }, "paper_data_uoa": "ae2bdef963023028" }, { "Word Error Rate (WER)": 15.28, "code_links": [ { "title": "facebookresearch/wav2letter", "url": "https://github.com/facebookresearch/wav2letter" } ], "date": "2020-02-24", "date2": 20200224, "model": "Local Prior Matching (Large Model, ConvLM LM)", "paper": { "title": "Semi-Supervised Speech Recognition via Local Prior Matching", "url": "https://cknow.io/lib/85675128034707b7" }, "paper_data_uoa": "85675128034707b7" }, { "Word Error Rate (WER)": 16.5, "code_links": [ { "title": "snipsco/snips-nlu", "url": "https://github.com/snipsco/snips-nlu" }, { "title": "snipsco/nlu-benchmark", "url": "https://github.com/snipsco/nlu-benchmark" }, { "title": "MiuLab/SlotGated-SLU", "url": "https://github.com/MiuLab/SlotGated-SLU" } ], "date": "2018-05-25", "date2": 20180525, "model": "Snips", "paper": { "title": "Snips Voice Platform: an embedded Spoken Language Understanding system for private-by-design voice interfaces", "url": "https://cknow.io/lib/4466ad0dd77ea158" }, "paper_data_uoa": "4466ad0dd77ea158" }, { "Word Error Rate (WER)": 20.84, "code_links": [ { "title": "facebookresearch/wav2letter", "url": 
"https://github.com/facebookresearch/wav2letter" } ], "date": "2020-02-24", "date2": 20200224, "model": "Local Prior Matching (Large Model)", "paper": { "title": "Semi-Supervised Speech Recognition via Local Prior Matching", "url": "https://cknow.io/lib/85675128034707b7" }, "paper_data_uoa": "85675128034707b7" } ]