[
  {
    "Bit per Character (BPC)": 0.93,
    "Number of params": "1542M",
    "code_links": [
      {
        "title": "huggingface/transformers",
        "url": "https://github.com/huggingface/transformers"
      },
      {
        "title": "openai/gpt-2",
        "url": "https://github.com/openai/gpt-2"
      },
      {
        "title": "minimaxir/gpt-2-simple",
        "url": "https://github.com/minimaxir/gpt-2-simple"
      },
      {
        "title": "huggingface/swift-coreml-transformers",
        "url": "https://github.com/huggingface/swift-coreml-transformers"
      },
      {
        "title": "imcaspar/gpt2-ml",
        "url": "https://github.com/imcaspar/gpt2-ml"
      }
    ],
    "date": "2019-02-14",
    "date2": 20190214,
    "model": "GPT-2x",
    "paper": {
      "title": "Language Models are Unsupervised Multitask Learners",
      "url": "https://cknow.io/lib/2a5eff987ab8d8ec"
    },
    "paper_data_uoa": "2a5eff987ab8d8ec"
  },
  {
    "Bit per Character (BPC)": 0.94,
    "Number of params": "277M",
    "code_links": [
      {
        "title": "benkrause/dynamiceval-transformer",
        "url": "https://github.com/benkrause/dynamiceval-transformer"
      }
    ],
    "date": "2019-04-17",
    "date2": 20190417,
    "model": "Transformer-XL + RMS dynamic eval + decayx",
    "paper": {
      "title": "Dynamic Evaluation of Transformer Language Models",
      "url": "https://cknow.io/lib/a0c795a840eaba5d"
    },
    "paper_data_uoa": "a0c795a840eaba5d"
  },
  {
    "Bit per Character (BPC)": 0.96,
    "Number of params": "77M",
    "code_links": [],
    "date": "2020-02-21",
    "date2": 20200221,
    "model": "Feedback Transformer",
    "paper": {
      "title": "Accessing Higher-level Representations in Sequential Transformers with Feedback Memory",
      "url": "https://cknow.io/lib/e4bac64d03507839"
    },
    "paper_data_uoa": "e4bac64d03507839"
  },
  {
    "Bit per Character (BPC)": 0.97,
    "code_links": [],
    "date": "2020-01-01",
    "date2": 20200101,
    "model": "24L Compressive Transformer",
    "paper": {
      "title": "Compressive Transformers for Long-Range Sequence Modelling",
      "url": "https://cknow.io/lib/f74fd9acd5a777cf"
    },
    "paper_data_uoa": "f74fd9acd5a777cf"
  },
  {
    "Bit per Character (BPC)": 0.98,
    "Number of params": "209M",
    "code_links": [
      {
        "title": "facebookresearch/adaptive-span",
        "url": "https://github.com/facebookresearch/adaptive-span"
      }
    ],
    "date": "2019-05-19",
    "date2": 20190519,
    "model": "24L Transformer + 8K adaptive span",
    "paper": {
      "title": "Adaptive Attention Span in Transformers",
      "url": "https://cknow.io/lib/94ad0986ed638bb1"
    },
    "paper_data_uoa": "94ad0986ed638bb1"
  },
  {
    "Bit per Character (BPC)": 0.98,
    "Number of params": "114M",
    "code_links": [
      {
        "title": "lucidrains/reformer-pytorch",
        "url": "https://github.com/lucidrains/reformer-pytorch"
      },
      {
        "title": "facebookresearch/adaptive-span",
        "url": "https://github.com/facebookresearch/adaptive-span"
      }
    ],
    "date": "2019-07-02",
    "date2": 20190702,
    "model": "All-attention network - 36 layers",
    "paper": {
      "title": "Augmenting Self-attention with Persistent Memory",
      "url": "https://cknow.io/lib/bbd24632af316e45"
    },
    "paper_data_uoa": "bbd24632af316e45"
  },
  {
    "Bit per Character (BPC)": 0.99,
    "Number of params": "277M",
    "code_links": [
      {
        "title": "huggingface/transformers",
        "url": "https://github.com/huggingface/transformers"
      },
      {
        "title": "kimiyoung/transformer-xl",
        "url": "https://github.com/kimiyoung/transformer-xl"
      },
      {
        "title": "benkrause/dynamiceval-transformer",
        "url": "https://github.com/benkrause/dynamiceval-transformer"
      },
      {
        "title": "Machine-Learning-Tokyo/Poetry-GAN",
        "url": "https://github.com/Machine-Learning-Tokyo/Poetry-GAN"
      },
      {
        "title": "inzva/fake-academic-paper-generation",
        "url": "https://github.com/inzva/fake-academic-paper-generation"
      },
      {
        "title": "threelittlemonkeys/transformer-pytorch",
        "url": "https://github.com/threelittlemonkeys/transformer-pytorch"
      },
      {
        "title": "cmunnis/BERT_vs_Transformer-XL",
        "url": "https://github.com/cmunnis/BERT_vs_Transformer-XL"
      },
      {
        "title": "zhdbwe/Paper-DailyReading",
        "url": "https://github.com/zhdbwe/Paper-DailyReading"
      },
      {
        "title": "samwisegamjeee/pytorch-transformers",
        "url": "https://github.com/samwisegamjeee/pytorch-transformers"
      },
      {
        "title": "listenviolet/XLNet",
        "url": "https://github.com/listenviolet/XLNet"
      }
    ],
    "date": "2019-01-09",
    "date2": 20190109,
    "model": "Transformer-XL - 24 layers",
    "paper": {
      "title": "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context",
      "url": "https://cknow.io/lib/05ebe08ab5d3e9ea"
    },
    "paper_data_uoa": "05ebe08ab5d3e9ea"
  },
  {
    "Bit per Character (BPC)": 0.99,
    "Number of params": "95M",
    "code_links": [
      {
        "title": "openai/sparse_attention",
        "url": "https://github.com/openai/sparse_attention"
      }
    ],
    "date": "2018-04-23",
    "date2": 20180423,
    "model": "Sparse Transformer (fixed)",
    "paper": {
      "title": "Generating Long Sequences with Sparse Transformers",
      "url": "https://cknow.io/lib/a158142bbf282e3a"
    },
    "paper_data_uoa": "a158142bbf282e3a"
  },
  {
    "Bit per Character (BPC)": 0.99,
    "Number of params": "102M",
    "code_links": [
      {
        "title": "allenai/longformer",
        "url": "https://github.com/allenai/longformer"
      }
    ],
    "date": "2020-04-10",
    "date2": 20200410,
    "model": "Longformer",
    "paper": {
      "title": "Longformer: The Long-Document Transformer",
      "url": "https://cknow.io/lib/0f4bf27bd1b87089"
    },
    "paper_data_uoa": "0f4bf27bd1b87089"
  },
  {
    "Bit per Character (BPC)": 1.01,
    "Number of params": "39M",
    "code_links": [
      {
        "title": "lucidrains/reformer-pytorch",
        "url": "https://github.com/lucidrains/reformer-pytorch"
      },
      {
        "title": "facebookresearch/adaptive-span",
        "url": "https://github.com/facebookresearch/adaptive-span"
      }
    ],
    "date": "2019-07-02",
    "date2": 20190702,
    "model": "All-attention network - 18 layers",
    "paper": {
      "title": "Augmenting Self-attention with Persistent Memory",
      "url": "https://cknow.io/lib/bbd24632af316e45"
    },
    "paper_data_uoa": "bbd24632af316e45"
  },
  {
    "Bit per Character (BPC)": 1.02,
    "Number of params": "39M",
    "code_links": [
      {
        "title": "facebookresearch/adaptive-span",
        "url": "https://github.com/facebookresearch/adaptive-span"
      }
    ],
    "date": "2019-05-19",
    "date2": 20190519,
    "model": "12L Transformer + 8K adaptive span",
    "paper": {
      "title": "Adaptive Attention Span in Transformers",
      "url": "https://cknow.io/lib/94ad0986ed638bb1"
    },
    "paper_data_uoa": "94ad0986ed638bb1"
  },
  {
    "Bit per Character (BPC)": 1.02,
    "code_links": [
      {
        "title": "yzh119/BPT",
        "url": "https://github.com/yzh119/BPT"
      }
    ],
    "date": "2019-11-11",
    "date2": 20191111,
    "model": "BP-Transformer - 12 Layers",
    "paper": {
      "title": "BP-Transformer: Modelling Long-Range Context via Binary Partitioning",
      "url": "https://cknow.io/lib/12f384eb7746f5bc"
    },
    "paper_data_uoa": "12f384eb7746f5bc"
  },
  {
    "Bit per Character (BPC)": 1.03,
    "Number of params": "88M",
    "code_links": [
      {
        "title": "huggingface/transformers",
        "url": "https://github.com/huggingface/transformers"
      },
      {
        "title": "kimiyoung/transformer-xl",
        "url": "https://github.com/kimiyoung/transformer-xl"
      },
      {
        "title": "benkrause/dynamiceval-transformer",
        "url": "https://github.com/benkrause/dynamiceval-transformer"
      },
      {
        "title": "Machine-Learning-Tokyo/Poetry-GAN",
        "url": "https://github.com/Machine-Learning-Tokyo/Poetry-GAN"
      },
      {
        "title": "inzva/fake-academic-paper-generation",
        "url": "https://github.com/inzva/fake-academic-paper-generation"
      },
      {
        "title": "threelittlemonkeys/transformer-pytorch",
        "url": "https://github.com/threelittlemonkeys/transformer-pytorch"
      },
      {
        "title": "cmunnis/BERT_vs_Transformer-XL",
        "url": "https://github.com/cmunnis/BERT_vs_Transformer-XL"
      },
      {
        "title": "zhdbwe/Paper-DailyReading",
        "url": "https://github.com/zhdbwe/Paper-DailyReading"
      },
      {
        "title": "samwisegamjeee/pytorch-transformers",
        "url": "https://github.com/samwisegamjeee/pytorch-transformers"
      },
      {
        "title": "listenviolet/XLNet",
        "url": "https://github.com/listenviolet/XLNet"
      }
    ],
    "date": "2019-01-09",
    "date2": 20190109,
    "model": "Transformer-XL - 18 layers",
    "paper": {
      "title": "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context",
      "url": "https://cknow.io/lib/05ebe08ab5d3e9ea"
    },
    "paper_data_uoa": "05ebe08ab5d3e9ea"
  },
  {
    "Bit per Character (BPC)": 1.06,
    "Number of params": "235M",
    "code_links": [
      {
        "title": "threelittlemonkeys/transformer-pytorch",
        "url": "https://github.com/threelittlemonkeys/transformer-pytorch"
      }
    ],
    "date": "2018-08-09",
    "date2": 20180809,
    "model": "64-layer Transformer",
    "paper": {
      "title": "Character-Level Language Modeling with Deeper Self-Attention",
      "url": "https://cknow.io/lib/cb9d9aa40bb96b10"
    },
    "paper_data_uoa": "cb9d9aa40bb96b10"
  },
  {
    "Bit per Character (BPC)": 1.06,
    "Number of params": "41M",
    "code_links": [
      {
        "title": "huggingface/transformers",
        "url": "https://github.com/huggingface/transformers"
      },
      {
        "title": "kimiyoung/transformer-xl",
        "url": "https://github.com/kimiyoung/transformer-xl"
      },
      {
        "title": "benkrause/dynamiceval-transformer",
        "url": "https://github.com/benkrause/dynamiceval-transformer"
      },
      {
        "title": "Machine-Learning-Tokyo/Poetry-GAN",
        "url": "https://github.com/Machine-Learning-Tokyo/Poetry-GAN"
      },
      {
        "title": "inzva/fake-academic-paper-generation",
        "url": "https://github.com/inzva/fake-academic-paper-generation"
      },
      {
        "title": "threelittlemonkeys/transformer-pytorch",
        "url": "https://github.com/threelittlemonkeys/transformer-pytorch"
      },
      {
        "title": "cmunnis/BERT_vs_Transformer-XL",
        "url": "https://github.com/cmunnis/BERT_vs_Transformer-XL"
      },
      {
        "title": "zhdbwe/Paper-DailyReading",
        "url": "https://github.com/zhdbwe/Paper-DailyReading"
      },
      {
        "title": "samwisegamjeee/pytorch-transformers",
        "url": "https://github.com/samwisegamjeee/pytorch-transformers"
      },
      {
        "title": "listenviolet/XLNet",
        "url": "https://github.com/listenviolet/XLNet"
      }
    ],
    "date": "2019-01-09",
    "date2": 20190109,
    "model": "Transformer-XL - 12 layers",
    "paper": {
      "title": "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context",
      "url": "https://cknow.io/lib/05ebe08ab5d3e9ea"
    },
    "paper_data_uoa": "05ebe08ab5d3e9ea"
  },
  {
    "Bit per Character (BPC)": 1.1,
    "Number of params": "52M",
    "code_links": [
      {
        "title": "saattrupdan/scholarly",
        "url": "https://github.com/saattrupdan/scholarly"
      },
      {
        "title": "alisafaya/SHA-RNN",
        "url": "https://github.com/alisafaya/SHA-RNN"
      }
    ],
    "date": "2019-11-26",
    "date2": 20191126,
    "model": "SHA-RNN",
    "paper": {
      "title": "Single Headed Attention RNN: Stop Thinking With Your Head",
      "url": "https://cknow.io/lib/c410c14fe4b3e839"
    },
    "paper_data_uoa": "c410c14fe4b3e839"
  },
  {
    "Bit per Character (BPC)": 1.11,
    "Number of params": "44M",
    "code_links": [
      {
        "title": "threelittlemonkeys/transformer-pytorch",
        "url": "https://github.com/threelittlemonkeys/transformer-pytorch"
      }
    ],
    "date": "2018-08-09",
    "date2": 20180809,
    "model": "12-layer Transformer",
    "paper": {
      "title": "Character-Level Language Modeling with Deeper Self-Attention",
      "url": "https://cknow.io/lib/cb9d9aa40bb96b10"
    },
    "paper_data_uoa": "cb9d9aa40bb96b10"
  },
  {
    "Bit per Character (BPC)": 1.24,
    "Number of params": "46M",
    "code_links": [],
    "date": "2016-09-26",
    "date2": 20160926,
    "model": "Large mLSTM",
    "paper": {
      "title": "Multiplicative LSTM for sequence modelling",
      "url": "https://cknow.io/lib/3874ef5a3fc386e9"
    },
    "paper_data_uoa": "3874ef5a3fc386e9"
  },
  {
    "Bit per Character (BPC)": 1.25,
    "Number of params": "47M",
    "code_links": [],
    "date": "2017-05-24",
    "date2": 20170524,
    "model": "Large FS-LSTM-4",
    "paper": {
      "title": "Fast-Slow Recurrent Neural Networks",
      "url": "https://cknow.io/lib/de3ad5ae49ed54aa"
    },
    "paper_data_uoa": "de3ad5ae49ed54aa"
  },
  {
    "Bit per Character (BPC)": 1.27,
    "Number of params": "46M",
    "code_links": [
      {
        "title": "julian121266/RecurrentHighwayNetworks",
        "url": "https://github.com/julian121266/RecurrentHighwayNetworks"
      },
      {
        "title": "jzilly/RecurrentHighwayNetworks",
        "url": "https://github.com/jzilly/RecurrentHighwayNetworks"
      },
      {
        "title": "davidsvaughn/dts-tf",
        "url": "https://github.com/davidsvaughn/dts-tf"
      }
    ],
    "date": "2016-07-12",
    "date2": 20160712,
    "model": "Recurrent highway networks",
    "paper": {
      "title": "Recurrent Highway Networks",
      "url": "https://cknow.io/lib/f8a413cba9ae8258"
    },
    "paper_data_uoa": "f8a413cba9ae8258"
  },
  {
    "Bit per Character (BPC)": 1.32,
    "Number of params": "35M",
    "code_links": [
      {
        "title": "kaiu85/hm-rnn",
        "url": "https://github.com/kaiu85/hm-rnn"
      }
    ],
    "date": "2016-09-06",
    "date2": 20160906,
    "model": "LN HM-LSTM",
    "paper": {
      "title": "Hierarchical Multiscale Recurrent Neural Networks",
      "url": "https://cknow.io/lib/2d38e6e2041637ac"
    },
    "paper_data_uoa": "2d38e6e2041637ac"
  },
  {
    "Bit per Character (BPC)": 1.34,
    "Number of params": "27M",
    "code_links": [],
    "date": "2016-09-27",
    "date2": 20160927,
    "model": "Hypernetworks",
    "paper": {
      "title": "HyperNetworks",
      "url": "https://cknow.io/lib/c95db634a17a9ef3"
    },
    "paper_data_uoa": "c95db634a17a9ef3"
  }
]