[ { "Top-1 Accuracy": 66.6, "Top-5 Accuracy": 91.3, "code_links": [ { "title": "MIT-HAN-LAB/temporal-shift-module", "url": "https://github.com/MIT-HAN-LAB/temporal-shift-module" }, { "title": "PaParaZz1/TemporalShiftModule", "url": "https://github.com/PaParaZz1/TemporalShiftModule" }, { "title": "WavesUR/embedded_TSM", "url": "https://github.com/WavesUR/embedded_TSM" }, { "title": "niveditarahurkar/CS231N-ActionRecognition", "url": "https://github.com/niveditarahurkar/CS231N-ActionRecognition" } ], "date": "2018-11-20", "date2": 20181120, "model": "TSM (RGB + Flow)", "paper": { "title": "TSM: Temporal Shift Module for Efficient Video Understanding", "url": "https://cknow.io/lib/0443f1df43248fc9" }, "paper_data_uoa": "0443f1df43248fc9" }, { "Top-1 Accuracy": 64.2, "Top-5 Accuracy": 89.8, "code_links": [], "date": "2019-08-07", "date2": 20190807, "model": "STM (16 frames, ImageNet pretraining)", "paper": { "title": "STM: SpatioTemporal and Motion Encoding for Action Recognition", "url": "https://cknow.io/lib/dbf65072610598a3" }, "paper_data_uoa": "dbf65072610598a3" }, { "Top-1 Accuracy": 62.2, "Top-5 Accuracy": 90.3, "code_links": [], "date": "2019-08-27", "date2": 20190827, "model": "TRG-ResNet-50", "paper": { "title": "Temporal Reasoning Graph for Activity Recognition", "url": "https://cknow.io/lib/48eac55d1d0207b6" }, "paper_data_uoa": "48eac55d1d0207b6" }, { "Top-1 Accuracy": 62, "code_links": [], "date": "2020-04-07", "date2": 20200407, "model": "TPN (TSM-50)", "paper": { "title": "Temporal Pyramid Network for Action Recognition", "url": "https://cknow.io/lib/f3df98b54f0c923e" }, "paper_data_uoa": "f3df98b54f0c923e" }, { "Top-1 Accuracy": 61.3, "Top-5 Accuracy": 91.4, "code_links": [], "date": "2019-08-27", "date2": 20190827, "model": "TRG-Inception-V3", "paper": { "title": "Temporal Reasoning Graph for Activity Recognition", "url": "https://cknow.io/lib/48eac55d1d0207b6" }, "paper_data_uoa": "48eac55d1d0207b6" }, { "Top-1 Accuracy": 61.2, "Top-5 Accuracy": 
89.3, "code_links": [], "date": "2019-08-27", "date2": 20190827, "model": "CCS + two-stream + TRN", "paper": { "title": "Cooperative Cross-Stream Network for Discriminative Action Representation", "url": "https://cknow.io/lib/09a74f888e9817da" }, "paper_data_uoa": "09a74f888e9817da" }, { "Top-1 Accuracy": 57.65, "Top-5 Accuracy": 83.95, "code_links": [ { "title": "xingyul/cpnet", "url": "https://github.com/xingyul/cpnet" }, { "title": "xingyul/meteornet", "url": "https://github.com/xingyul/meteornet" } ], "date": "2019-05-20", "date2": 20190520, "model": "CPNet Res34, 5 CP", "paper": { "title": "Learning Video Representations from Correspondence Proposals", "url": "https://cknow.io/lib/9e742789227dfb82" }, "paper_data_uoa": "9e742789227dfb82" }, { "Top-1 Accuracy": 55.52, "Top-5 Accuracy": 83.06, "code_links": [ { "title": "metalbubble/TRN-pytorch", "url": "https://github.com/metalbubble/TRN-pytorch" }, { "title": "okankop/MFF-pytorch", "url": "https://github.com/okankop/MFF-pytorch" } ], "date": "2017-11-22", "date2": 20171122, "model": "2-Stream TRN", "paper": { "title": "Temporal Relational Reasoning in Videos", "url": "https://cknow.io/lib/ab89498599fa31b3" }, "paper_data_uoa": "ab89498599fa31b3" }, { "Top-1 Accuracy": 52.3, "code_links": [], "date": "2019-06-27", "date2": 20190627, "model": "TAM (5-shot)", "paper": { "title": "Few-Shot Video Classification via Temporal Alignment", "url": "https://cknow.io/lib/46c481c9757c6ff1" }, "paper_data_uoa": "46c481c9757c6ff1" }, { "Top-1 Accuracy": 51.33, "Top-5 Accuracy": 80.46, "code_links": [ { "title": "TwentyBN/smth-smth-v2-baseline-with-models", "url": "https://github.com/TwentyBN/smth-smth-v2-baseline-with-models" }, { "title": "TwentyBN/something-something-v2-baseline", "url": "https://github.com/TwentyBN/something-something-v2-baseline" }, { "title": "caspillaga/Conv3DSelfAttention", "url": "https://github.com/caspillaga/Conv3DSelfAttention" } ], "date": "2017-06-13", "date2": 20170613,
"model": "model3D_1 with left-right augmentation and fps jitter", "paper": { "title": "The \"something something\" video database for learning and evaluating visual common sense", "url": "https://cknow.io/lib/25d6c0d541fdb268" }, "paper_data_uoa": "25d6c0d541fdb268" }, { "Top-1 Accuracy": 49.9, "Top-5 Accuracy": 79.1, "code_links": [], "date": "2019-04-05", "date2": 20190405, "model": "Prob-Distill", "paper": { "title": "Paying More Attention to Motion: Attention Distillation for Learning Video Representations", "url": "https://cknow.io/lib/b222433297a9a5c3" }, "paper_data_uoa": "b222433297a9a5c3" }, { "Top-1 Accuracy": 47.73, "code_links": [ { "title": "fubel/stmodeling", "url": "https://github.com/fubel/stmodeling" } ], "date": "2019-09-11", "date2": 20190911, "model": "STM + TRNMultiscale", "paper": { "title": "Comparative Analysis of CNN-based Spatiotemporal Reasoning in Videos", "url": "https://cknow.io/lib/fbe3c24794e9cae7" }, "paper_data_uoa": "fbe3c24794e9cae7" } ]