[ { "Bit per Character (BPC)": 0.93, "Number of params": "1542M", "code_links": [ { "title": "huggingface/transformers", "url": "https://github.com/huggingface/transformers" }, { "title": "openai/gpt-2", "url": "https://github.com/openai/gpt-2" }, { "title": "minimaxir/gpt-2-simple", "url": "https://github.com/minimaxir/gpt-2-simple" }, { "title": "huggingface/swift-coreml-transformers", "url": "https://github.com/huggingface/swift-coreml-transformers" }, { "title": "imcaspar/gpt2-ml", "url": "https://github.com/imcaspar/gpt2-ml" } ], "date": "2019-02-14", "date2": 20190214, "model": "GPT-2x", "paper": { "title": "Language Models are Unsupervised Multitask Learners", "url": "https://cknow.io/lib/2a5eff987ab8d8ec" }, "paper_data_uoa": "2a5eff987ab8d8ec" }, { "Bit per Character (BPC)": 0.94, "Number of params": "277M", "code_links": [ { "title": "benkrause/dynamiceval-transformer", "url": "https://github.com/benkrause/dynamiceval-transformer" } ], "date": "2019-04-17", "date2": 20190417, "model": "Transformer-XL + RMS dynamic eval + decayx", "paper": { "title": "Dynamic Evaluation of Transformer Language Models", "url": "https://cknow.io/lib/a0c795a840eaba5d" }, "paper_data_uoa": "a0c795a840eaba5d" }, { "Bit per Character (BPC)": 0.96, "Number of params": "77M", "code_links": [], "date": "2020-02-21", "date2": 20200221, "model": "Feedback Transformer", "paper": { "title": "Accessing Higher-level Representations in Sequential Transformers with Feedback Memory", "url": "https://cknow.io/lib/e4bac64d03507839" }, "paper_data_uoa": "e4bac64d03507839" }, { "Bit per Character (BPC)": 0.97, "code_links": [], "date": "2020-01-01", "date2": 20200101, "model": "24L Compressive Transformer", "paper": { "title": "Compressive Transformers for Long-Range Sequence Modelling", "url": "https://cknow.io/lib/f74fd9acd5a777cf" }, "paper_data_uoa": "f74fd9acd5a777cf" }, { "Bit per Character (BPC)": 0.98, "Number of params": "209M", "code_links": [ { "title": "facebookresearch/adaptive-span", "url": "https://github.com/facebookresearch/adaptive-span" } ], "date": "2019-05-19", "date2": 20190519, "model": "24L Transformer + 8K adaptive span", "paper": { "title": "Adaptive Attention Span in Transformers", "url": "https://cknow.io/lib/94ad0986ed638bb1" }, "paper_data_uoa": "94ad0986ed638bb1" }, { "Bit per Character (BPC)": 0.98, "Number of params": "114M", "code_links": [ { "title": "lucidrains/reformer-pytorch", "url": "https://github.com/lucidrains/reformer-pytorch" }, { "title": "facebookresearch/adaptive-span", "url": "https://github.com/facebookresearch/adaptive-span" } ], "date": "2019-07-02", "date2": 20190702, "model": "All-attention network - 36 layers", "paper": { "title": "Augmenting Self-attention with Persistent Memory", "url": "https://cknow.io/lib/bbd24632af316e45" }, "paper_data_uoa": "bbd24632af316e45" }, { "Bit per Character (BPC)": 0.99, "Number of params": "277M", "code_links": [ { "title": "huggingface/transformers", "url": "https://github.com/huggingface/transformers" }, { "title": "kimiyoung/transformer-xl", "url": "https://github.com/kimiyoung/transformer-xl" }, { "title": "benkrause/dynamiceval-transformer", "url": "https://github.com/benkrause/dynamiceval-transformer" }, { "title": "Machine-Learning-Tokyo/Poetry-GAN", "url": "https://github.com/Machine-Learning-Tokyo/Poetry-GAN" }, { "title": "inzva/fake-academic-paper-generation", "url": "https://github.com/inzva/fake-academic-paper-generation" }, { "title": "threelittlemonkeys/transformer-pytorch", "url": "https://github.com/threelittlemonkeys/transformer-pytorch" }, { "title": "cmunnis/BERT_vs_Transformer-XL", "url": "https://github.com/cmunnis/BERT_vs_Transformer-XL" }, { "title": "zhdbwe/Paper-DailyReading", "url": "https://github.com/zhdbwe/Paper-DailyReading" }, { "title": "samwisegamjeee/pytorch-transformers", "url": "https://github.com/samwisegamjeee/pytorch-transformers" }, { "title": "listenviolet/XLNet", "url": "https://github.com/listenviolet/XLNet" } ], "date": "2019-01-09", "date2": 20190109, "model": "Transformer-XL - 24 layers", "paper": { "title": "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context", "url": "https://cknow.io/lib/05ebe08ab5d3e9ea" }, "paper_data_uoa": "05ebe08ab5d3e9ea" }, { "Bit per Character (BPC)": 0.99, "Number of params": "95M", "code_links": [ { "title": "openai/sparse_attention", "url": "https://github.com/openai/sparse_attention" } ], "date": "2018-04-23", "date2": 20180423, "model": "Sparse Transformer (fixed)", "paper": { "title": "Generating Long Sequences with Sparse Transformers", "url": "https://cknow.io/lib/a158142bbf282e3a" }, "paper_data_uoa": "a158142bbf282e3a" }, { "Bit per Character (BPC)": 0.99, "Number of params": "102M", "code_links": [ { "title": "allenai/longformer", "url": "https://github.com/allenai/longformer" } ], "date": "2020-04-10", "date2": 20200410, "model": "Longformer", "paper": { "title": "Longformer: The Long-Document Transformer", "url": "https://cknow.io/lib/0f4bf27bd1b87089" }, "paper_data_uoa": "0f4bf27bd1b87089" }, { "Bit per Character (BPC)": 1.01, "Number of params": "39M", "code_links": [ { "title": "lucidrains/reformer-pytorch", "url": "https://github.com/lucidrains/reformer-pytorch" }, { "title": "facebookresearch/adaptive-span", "url": "https://github.com/facebookresearch/adaptive-span" } ], "date": "2019-07-02", "date2": 20190702, "model": "All-attention network - 18 layers", "paper": { "title": "Augmenting Self-attention with Persistent Memory", "url": "https://cknow.io/lib/bbd24632af316e45" }, "paper_data_uoa": "bbd24632af316e45" }, { "Bit per Character (BPC)": 1.02, "Number of params": "39M", "code_links": [ { "title": "facebookresearch/adaptive-span", "url": "https://github.com/facebookresearch/adaptive-span" } ], "date": "2019-05-19", "date2": 20190519, "model": "12L Transformer + 8K adaptive span", "paper": { "title": "Adaptive Attention Span in Transformers", "url": "https://cknow.io/lib/94ad0986ed638bb1" }, "paper_data_uoa": "94ad0986ed638bb1" }, { "Bit per Character (BPC)": 1.02, "code_links": [ { "title": "yzh119/BPT", "url": "https://github.com/yzh119/BPT" } ], "date": "2019-11-11", "date2": 20191111, "model": "BP-Transformer - 12 Layers", "paper": { "title": "BP-Transformer: Modelling Long-Range Context via Binary Partitioning", "url": "https://cknow.io/lib/12f384eb7746f5bc" }, "paper_data_uoa": "12f384eb7746f5bc" }, { "Bit per Character (BPC)": 1.03, "Number of params": "88M", "code_links": [ { "title": "huggingface/transformers", "url": "https://github.com/huggingface/transformers" }, { "title": "kimiyoung/transformer-xl", "url": "https://github.com/kimiyoung/transformer-xl" }, { "title": "benkrause/dynamiceval-transformer", "url": "https://github.com/benkrause/dynamiceval-transformer" }, { "title": "Machine-Learning-Tokyo/Poetry-GAN", "url": "https://github.com/Machine-Learning-Tokyo/Poetry-GAN" }, { "title": "inzva/fake-academic-paper-generation", "url": "https://github.com/inzva/fake-academic-paper-generation" }, { "title": "threelittlemonkeys/transformer-pytorch", "url": "https://github.com/threelittlemonkeys/transformer-pytorch" }, { "title": "cmunnis/BERT_vs_Transformer-XL", "url": "https://github.com/cmunnis/BERT_vs_Transformer-XL" }, { "title": "zhdbwe/Paper-DailyReading", "url": "https://github.com/zhdbwe/Paper-DailyReading" }, { "title": "samwisegamjeee/pytorch-transformers", "url": "https://github.com/samwisegamjeee/pytorch-transformers" }, { "title": "listenviolet/XLNet", "url": "https://github.com/listenviolet/XLNet" } ], "date": "2019-01-09", "date2": 20190109, "model": "Transformer-XL - 18 layers", "paper": { "title": "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context", "url": "https://cknow.io/lib/05ebe08ab5d3e9ea" }, "paper_data_uoa": "05ebe08ab5d3e9ea" }, { "Bit per Character (BPC)": 1.06, "Number of params": "235M", "code_links": [ { "title": "threelittlemonkeys/transformer-pytorch", "url": "https://github.com/threelittlemonkeys/transformer-pytorch" } ], "date": "2018-08-09", "date2": 20180809, "model": "64-layer Transformer", "paper": { "title": "Character-Level Language Modeling with Deeper Self-Attention", "url": "https://cknow.io/lib/cb9d9aa40bb96b10" }, "paper_data_uoa": "cb9d9aa40bb96b10" }, { "Bit per Character (BPC)": 1.06, "Number of params": "41M", "code_links": [ { "title": "huggingface/transformers", "url": "https://github.com/huggingface/transformers" }, { "title": "kimiyoung/transformer-xl", "url": "https://github.com/kimiyoung/transformer-xl" }, { "title": "benkrause/dynamiceval-transformer", "url": "https://github.com/benkrause/dynamiceval-transformer" }, { "title": "Machine-Learning-Tokyo/Poetry-GAN", "url": "https://github.com/Machine-Learning-Tokyo/Poetry-GAN" }, { "title": "inzva/fake-academic-paper-generation", "url": "https://github.com/inzva/fake-academic-paper-generation" }, { "title": "threelittlemonkeys/transformer-pytorch", "url": "https://github.com/threelittlemonkeys/transformer-pytorch" }, { "title": "cmunnis/BERT_vs_Transformer-XL", "url": "https://github.com/cmunnis/BERT_vs_Transformer-XL" }, { "title": "zhdbwe/Paper-DailyReading", "url": "https://github.com/zhdbwe/Paper-DailyReading" }, { "title": "samwisegamjeee/pytorch-transformers", "url": "https://github.com/samwisegamjeee/pytorch-transformers" }, { "title": "listenviolet/XLNet", "url": "https://github.com/listenviolet/XLNet" } ], "date": "2019-01-09", "date2": 20190109, "model": "Transformer-XL - 12 layers", "paper": { "title": "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context", "url": "https://cknow.io/lib/05ebe08ab5d3e9ea" }, "paper_data_uoa": "05ebe08ab5d3e9ea" }, { "Bit per Character (BPC)": 1.1, "Number of params": "52M", "code_links": [ { "title": "saattrupdan/scholarly", "url": "https://github.com/saattrupdan/scholarly" }, { "title": "alisafaya/SHA-RNN", "url": "https://github.com/alisafaya/SHA-RNN" } ], "date": "2019-11-26", "date2": 20191126, "model": "SHA-RNN", "paper": { "title": "Single Headed Attention RNN: Stop Thinking With Your Head", "url": "https://cknow.io/lib/c410c14fe4b3e839" }, "paper_data_uoa": "c410c14fe4b3e839" }, { "Bit per Character (BPC)": 1.11, "Number of params": "44M", "code_links": [ { "title": "threelittlemonkeys/transformer-pytorch", "url": "https://github.com/threelittlemonkeys/transformer-pytorch" } ], "date": "2018-08-09", "date2": 20180809, "model": "12-layer Transformer", "paper": { "title": "Character-Level Language Modeling with Deeper Self-Attention", "url": "https://cknow.io/lib/cb9d9aa40bb96b10" }, "paper_data_uoa": "cb9d9aa40bb96b10" }, { "Bit per Character (BPC)": 1.24, "Number of params": "46M", "code_links": [], "date": "2016-09-26", "date2": 20160926, "model": "Large mLSTM", "paper": { "title": "Multiplicative LSTM for sequence modelling", "url": "https://cknow.io/lib/3874ef5a3fc386e9" }, "paper_data_uoa": "3874ef5a3fc386e9" }, { "Bit per Character (BPC)": 1.25, "Number of params": "47M", "code_links": [], "date": "2017-05-24", "date2": 20170524, "model": "Large FS-LSTM-4", "paper": { "title": "Fast-Slow Recurrent Neural Networks", "url": "https://cknow.io/lib/de3ad5ae49ed54aa" }, "paper_data_uoa": "de3ad5ae49ed54aa" }, { "Bit per Character (BPC)": 1.27, "Number of params": "46M", "code_links": [ { "title": "julian121266/RecurrentHighwayNetworks", "url": "https://github.com/julian121266/RecurrentHighwayNetworks" }, { "title": "jzilly/RecurrentHighwayNetworks", "url": "https://github.com/jzilly/RecurrentHighwayNetworks" }, { "title": "davidsvaughn/dts-tf", "url": "https://github.com/davidsvaughn/dts-tf" } ], "date": "2016-07-12", "date2": 20160712, "model": "Recurrent highway networks", "paper": { "title": "Recurrent Highway Networks", "url": "https://cknow.io/lib/f8a413cba9ae8258" }, "paper_data_uoa": "f8a413cba9ae8258" }, { "Bit per Character (BPC)": 1.32, "Number of params": "35M", "code_links": [ { "title": "kaiu85/hm-rnn", "url": "https://github.com/kaiu85/hm-rnn" } ], "date": "2016-09-06", "date2": 20160906, "model": "LN HM-LSTM", "paper": { "title": "Hierarchical Multiscale Recurrent Neural Networks", "url": "https://cknow.io/lib/2d38e6e2041637ac" }, "paper_data_uoa": "2d38e6e2041637ac" }, { "Bit per Character (BPC)": 1.34, "Number of params": "27M", "code_links": [], "date": "2016-09-27", "date2": 20160927, "model": "Hypernetworks", "paper": { "title": "HyperNetworks", "url": "https://cknow.io/lib/c95db634a17a9ef3" }, "paper_data_uoa": "c95db634a17a9ef3" } ]