[
  {
    "50.00 percentile latency (ns)": 4017351,
    "90.00 percentile latency (ns)": 7347266,
    "95.00 percentile latency (ns)": 8381065,
    "97.00 percentile latency (ns)": 8943585,
    "99.00 percentile latency (ns)": 9793473,
    "99.90 percentile latency (ns)": 11452192,
    "Completed samples per second": 126530.75,
    "Max latency (ns)": 19683731,
    "Mean latency (ns)": 4311500,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 239433,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 126531.36,
    "accelerator_cooling_type": "passive",
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "1.1.3",
    "characteristics.power": 835.5230000000006,
    "characteristics.power.normalized_per_core": 208.88075000000015,
    "characteristics.power.normalized_per_processor": 208.88075000000015,
    "characteristics.scheduled_queries_per_second": 126531.36,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31632.84,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31632.84,
    "ck_system": "XE2420_datacenter_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "filesystem": "ext3/ext4",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_cooling_type": "passive",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "4 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "management_firmware_version": "4.40.10.00",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE2420_datacenter_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 204800,
    "power_settings": "closed/DellEMC/measurements/XE2420_datacenter_T4x4_TRT/XE2420_T4x4_power_settings.adoc",
    "print_timestamps": 0,
    "problem": false,
    "psu_details": "2x2000W",
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_cooling_type": "air",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_datacenter_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 126550,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "24f561fe894008a6",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 3986754,
    "90.00 percentile latency (ns)": 7313540,
    "95.00 percentile latency (ns)": 8363555,
    "97.00 percentile latency (ns)": 8932407,
    "99.00 percentile latency (ns)": 9782400,
    "99.90 percentile latency (ns)": 11334965,
    "Completed samples per second": 126530.71,
    "Max latency (ns)": 19464765,
    "Mean latency (ns)": 4286588,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 233929,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 126531.36,
    "accelerator_cooling_type": "passive",
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "1.1.3",
    "characteristics.power": 835.9083333333339,
    "characteristics.power.normalized_per_core": 208.97708333333347,
    "characteristics.power.normalized_per_processor": 208.97708333333347,
    "characteristics.scheduled_queries_per_second": 126531.36,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31632.84,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31632.84,
    "ck_system": "XE2420_datacenter_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "filesystem": "ext3/ext4",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_cooling_type": "passive",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "4 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "management_firmware_version": "4.40.10.00",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE2420_datacenter_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 204800,
    "power_settings": "closed/DellEMC/measurements/XE2420_datacenter_T4x4_TRT/XE2420_T4x4_power_settings.adoc",
    "print_timestamps": 0,
    "problem": false,
    "psu_details": "2x2000W",
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_cooling_type": "air",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_datacenter_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 126550,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "65532728421b45c5",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 1797214,
    "90.00 percentile latency (ns)": 4463735,
    "95.00 percentile latency (ns)": 6166069,
    "97.00 percentile latency (ns)": 7924552,
    "99.00 percentile latency (ns)": 12896851,
    "99.90 percentile latency (ns)": 24772282,
    "Completed samples per second": 620116.87,
    "Max latency (ns)": 68669645,
    "Mean latency (ns)": 2480235,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 181476,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 620118.17,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA A40",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "",
    "characteristics.scheduled_queries_per_second": 620118.17,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 62011.817,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 62011.817,
    "ck_system": "DSS8440_A40x10_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "disk_controllers": "",
    "disk_drives": "",
    "division": "closed",
    "filesystem": "",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "management_firmware_version": "",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "network_speed_mbit": "",
    "nics_enabled_connected": "",
    "nics_enabled_firmware": "",
    "nics_enabled_os": "",
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/DSS8440_A40x10_TRT",
    "number_of_nodes": 1,
    "number_of_type_nics_installed": "",
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_hardware": "",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 204800,
    "power_management": "",
    "power_supply_details": "",
    "power_supply_quantity_and_rating_watts": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_A40x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 620000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "bea8c144de9abc88",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 1797214,
    "90.00 percentile latency (ns)": 4463735,
    "95.00 percentile latency (ns)": 6166069,
    "97.00 percentile latency (ns)": 7924552,
    "99.00 percentile latency (ns)": 12896851,
    "99.90 percentile latency (ns)": 24772282,
    "Completed samples per second": 620116.87,
    "Max latency (ns)": 68669645,
    "Mean latency (ns)": 2480235,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 181476,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 620118.17,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA A40",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "",
    "characteristics.scheduled_queries_per_second": 620118.17,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 62011.817,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 62011.817,
    "ck_system": "DSS8440_A40x10_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "disk_controllers": "",
    "disk_drives": "",
    "division": "closed",
    "filesystem": "",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "management_firmware_version": "",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "network_speed_mbit": "",
    "nics_enabled_connected": "",
    "nics_enabled_firmware": "",
    "nics_enabled_os": "",
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/DSS8440_A40x10_TRT",
    "number_of_nodes": 1,
    "number_of_type_nics_installed": "",
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_hardware": "",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 204800,
    "power_management": "",
    "power_supply_details": "",
    "power_supply_quantity_and_rating_watts": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_A40x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 620000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "58cfa468c94d0deb",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 1707509,
    "90.00 percentile latency (ns)": 2896942,
    "95.00 percentile latency (ns)": 3338644,
    "97.00 percentile latency (ns)": 3652513,
    "99.00 percentile latency (ns)": 4772308,
    "99.90 percentile latency (ns)": 285875300,
    "Completed samples per second": 400020.59,
    "Max latency (ns)": 372703976,
    "Mean latency (ns)": 2658032,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 132351,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 400021.0,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIe-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 400021.0,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 133340.33333333334,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 133340.33333333334,
    "ck_system": "R740_A100-PCIe-40GBx3_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.8 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R740_A100-PCIe-40GBx3_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 204800,
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_A100-PCIe-40GBx3_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 400000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "b96292d69975b099",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 1707509,
    "90.00 percentile latency (ns)": 2896942,
    "95.00 percentile latency (ns)": 3338644,
    "97.00 percentile latency (ns)": 3652513,
    "99.00 percentile latency (ns)": 4772308,
    "99.90 percentile latency (ns)": 285875300,
    "Completed samples per second": 400020.59,
    "Max latency (ns)": 372703976,
    "Mean latency (ns)": 2658032,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 132351,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 400021.0,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIe-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 400021.0,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 133340.33333333334,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 133340.33333333334,
    "ck_system": "R740_A100-PCIe-40GBx3_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.8 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R740_A100-PCIe-40GBx3_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 204800,
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_A100-PCIe-40GBx3_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 400000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "fdd567ff42c44d86",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 5389936,
    "90.00 percentile latency (ns)": 12019440,
    "95.00 percentile latency (ns)": 14553985,
    "97.00 percentile latency (ns)": 16000205,
    "99.00 percentile latency (ns)": 18346214,
    "99.90 percentile latency (ns)": 26713562,
    "Completed samples per second": 223029.98,
    "Max latency (ns)": 48682446,
    "Mean latency (ns)": 6306270,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 198999,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 223032.29,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 223032.29,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 74344.09666666666,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 74344.09666666666,
    "ck_system": "R7525_QuadroRTX8000x3_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.8 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 204800,
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 223050,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "00998025edc4e919",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 5389936,
    "90.00 percentile latency (ns)": 12019440,
    "95.00 percentile latency (ns)": 14553985,
    "97.00 percentile latency (ns)": 16000205,
    "99.00 percentile latency (ns)": 18346214,
    "99.90 percentile latency (ns)": 26713562,
    "Completed samples per second": 223029.98,
    "Max latency (ns)": 48682446,
    "Mean latency (ns)": 6306270,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 198999,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 223032.29,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 223032.29,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 74344.09666666666,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 74344.09666666666,
    "ck_system": "R7525_QuadroRTX8000x3_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.8 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 204800,
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 223050,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "c2d81448492d146d",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 4240920,
    "90.00 percentile latency (ns)": 7972083,
    "95.00 percentile latency (ns)": 8883645,
    "97.00 percentile latency (ns)": 10116231,
    "99.00 percentile latency (ns)": 28920661,
    "99.90 percentile latency (ns)": 66610984,
    "Completed samples per second": 260050.68,
    "Max latency (ns)": 73151205,
    "Mean latency (ns)": 5003988,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 226151,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 260062.59,
    "accelerator_frequency": "1590MHz",
    "accelerator_host_interconnect": "PCI Express 3.0",
    "accelerator_interconnect": "PCI Express 3.0",
    "accelerator_interconnect_topology": "4 Accelerators per CPU",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Tesla T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 8,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 260062.59,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 32507.82375,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 32507.82375,
    "ck_system": "R7525_T4x8_TRT",
    "ck_used": true,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "8x64GB DDR4-3200 HMAA8GR7AJR4N-XN RDIMM ECC",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "2MB+16MB+128MB",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.35GHz",
    "host_processor_interconnect": "Infinity Fabric",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB (5x800GB WUSTR6480ASS200 in RAID5)",
    "host_storage_type": "3D-TLC Solid State with 12Gbps SAS",
    "hw_notes": "ECC off",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 8,
    "normalize_processors": 8,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_T4x8_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_T4x8_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 260000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "b38925f9e28c4eda",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 4240920,
    "90.00 percentile latency (ns)": 7972083,
    "95.00 percentile latency (ns)": 8883645,
    "97.00 percentile latency (ns)": 10116231,
    "99.00 percentile latency (ns)": 28920661,
    "99.90 percentile latency (ns)": 66610984,
    "Completed samples per second": 260050.68,
    "Max latency (ns)": 73151205,
    "Mean latency (ns)": 5003988,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 226151,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 260062.59,
    "accelerator_frequency": "1590MHz",
    "accelerator_host_interconnect": "PCI Express 3.0",
    "accelerator_interconnect": "PCI Express 3.0",
    "accelerator_interconnect_topology": "4 Accelerators per CPU",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Tesla T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 8,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 260062.59,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 32507.82375,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 32507.82375,
    "ck_system": "R7525_T4x8_TRT",
    "ck_used": true,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "8x64GB DDR4-3200 HMAA8GR7AJR4N-XN RDIMM ECC",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "2MB+16MB+128MB",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.35GHz",
    "host_processor_interconnect": "Infinity Fabric",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB (5x800GB WUSTR6480ASS200 in RAID5)",
    "host_storage_type": "3D-TLC Solid State with 12Gbps SAS",
    "hw_notes": "ECC off",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 8,
    "normalize_processors": 8,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_T4x8_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_T4x8_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 260000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "293a442caaa0f795",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 2814572,
    "90.00 percentile latency (ns)": 5197274,
    "95.00 percentile latency (ns)": 6178533,
    "97.00 percentile latency (ns)": 7093669,
    "99.00 percentile latency (ns)": 16479284,
    "99.90 percentile latency (ns)": 52008458,
    "Completed samples per second": 126507.64,
    "Max latency (ns)": 62099322,
    "Mean latency (ns)": 3405626,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 256791,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 126513.98,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 126513.98,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31628.495,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31628.495,
    "ck_system": "XE2420_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.10GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/XE2420_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 126500,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "c2a5bbdf8da6b04a",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 2814572,
    "90.00 percentile latency (ns)": 5197274,
    "95.00 percentile latency (ns)": 6178533,
    "97.00 percentile latency (ns)": 7093669,
    "99.00 percentile latency (ns)": 16479284,
    "99.90 percentile latency (ns)": 52008458,
    "Completed samples per second": 126507.64,
    "Max latency (ns)": 62099322,
    "Mean latency (ns)": 3405626,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 256791,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 126513.98,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 126513.98,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31628.495,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31628.495,
    "ck_system": "XE2420_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.10GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/XE2420_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 126500,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "f4592318e9013b58",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 9551549,
    "90.00 percentile latency (ns)": 12614111,
    "95.00 percentile latency (ns)": 13580069,
    "97.00 percentile latency (ns)": 14265206,
    "99.00 percentile latency (ns)": 16631074,
    "99.90 percentile latency (ns)": 94473376,
    "Completed samples per second": 385035.2,
    "Max latency (ns)": 119675324,
    "Mean latency (ns)": 9917343,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 200576,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 385085.3,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIE-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 2,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 385085.3,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 192542.65,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 192542.65,
    "ck_system": "R7525_A100x2_TRT",
    "ck_used": true,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.50GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 2,
    "normalize_processors": 2,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_A100x2_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100x2_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 385000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "7911a71a701cd29b",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 9551549,
    "90.00 percentile latency (ns)": 12614111,
    "95.00 percentile latency (ns)": 13580069,
    "97.00 percentile latency (ns)": 14265206,
    "99.00 percentile latency (ns)": 16631074,
    "99.90 percentile latency (ns)": 94473376,
    "Completed samples per second": 385035.2,
    "Max latency (ns)": 119675324,
    "Mean latency (ns)": 9917343,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 200576,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 385085.3,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIE-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 2,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 385085.3,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 192542.65,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 192542.65,
    "ck_system": "R7525_A100x2_TRT",
    "ck_used": true,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.50GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 2,
    "normalize_processors": 2,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_A100x2_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100x2_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 385000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "f446fc98027e3952",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 5846737,
    "90.00 percentile latency (ns)": 15135023,
    "95.00 percentile latency (ns)": 18574822,
    "97.00 percentile latency (ns)": 20089548,
    "99.00 percentile latency (ns)": 22222354,
    "99.90 percentile latency (ns)": 25561646,
    "Completed samples per second": 777727.18,
    "Max latency (ns)": 38585467,
    "Mean latency (ns)": 7272711,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 232263,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 777955.77,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 777955.77,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 77795.577,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 77795.577,
    "ck_system": "DSS8440_QuadroRTX8000x10_TRT",
    "ck_used": true,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.10GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX8000x10_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX8000x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 777500,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "9569341809883054",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 5846737,
    "90.00 percentile latency (ns)": 15135023,
    "95.00 percentile latency (ns)": 18574822,
    "97.00 percentile latency (ns)": 20089548,
    "99.00 percentile latency (ns)": 22222354,
    "99.90 percentile latency (ns)": 25561646,
    "Completed samples per second": 777727.18,
    "Max latency (ns)": 38585467,
    "Mean latency (ns)": 7272711,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 232263,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 777955.77,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 777955.77,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 77795.577,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 77795.577,
    "ck_system": "DSS8440_QuadroRTX8000x10_TRT",
    "ck_used": true,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.10GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX8000x10_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX8000x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 777500,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "cbe10681684bbb8c",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 5668100,
    "90.00 percentile latency (ns)": 8099003,
    "95.00 percentile latency (ns)": 10456538,
    "97.00 percentile latency (ns)": 12384978,
    "99.00 percentile latency (ns)": 26580794,
    "99.90 percentile latency (ns)": 63160889,
    "Completed samples per second": 503144.5,
    "Max latency (ns)": 72154623,
    "Mean latency (ns)": 5614811,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 218992,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 503200.26,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 16,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 503200.26,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31450.01625,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31450.01625,
    "ck_system": "DSS8440_T4x16_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "ECC off",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 16,
    "normalize_processors": 16,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x16_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x16_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 503000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "c0d4fc78d5998bd6",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 5668100,
    "90.00 percentile latency (ns)": 8099003,
    "95.00 percentile latency (ns)": 10456538,
    "97.00 percentile latency (ns)": 12384978,
    "99.00 percentile latency (ns)": 26580794,
    "99.90 percentile latency (ns)": 63160889,
    "Completed samples per second": 503144.5,
    "Max latency (ns)": 72154623,
    "Mean latency (ns)": 5614811,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 218992,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 503200.26,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 16,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 503200.26,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31450.01625,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31450.01625,
    "ck_system": "DSS8440_T4x16_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "ECC off",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 16,
    "normalize_processors": 16,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x16_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x16_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 503000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "d9f4f2b9f4ac791a",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 3163131,
    "90.00 percentile latency (ns)": 6876396,
    "95.00 percentile latency (ns)": 9403464,
    "97.00 percentile latency (ns)": 12380003,
    "99.00 percentile latency (ns)": 25680651,
    "99.90 percentile latency (ns)": 59762371,
    "Completed samples per second": 126485.42,
    "Max latency (ns)": 67917198,
    "Mean latency (ns)": 4244183,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 303135,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 126513.98,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 126513.98,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31628.495,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31628.495,
    "ck_system": "R7515_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.0.14, CUDA 11.0.207",
    "host_memory_capacity": "256 GB",
    "host_memory_configuration": "DDR-4",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.0GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 1,
    "host_storage_capacity": "3.2 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7515_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.27, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7515_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 126500,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 120,
    "uid": "a9dfd06495ab53c2",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 3163131,
    "90.00 percentile latency (ns)": 6876396,
    "95.00 percentile latency (ns)": 9403464,
    "97.00 percentile latency (ns)": 12380003,
    "99.00 percentile latency (ns)": 25680651,
    "99.90 percentile latency (ns)": 59762371,
    "Completed samples per second": 126485.42,
    "Max latency (ns)": 67917198,
    "Mean latency (ns)": 4244183,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 303135,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 126513.98,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 126513.98,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31628.495,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31628.495,
    "ck_system": "R7515_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.0.14, CUDA 11.0.207",
    "host_memory_capacity": "256 GB",
    "host_memory_configuration": "DDR-4",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.0GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 1,
    "host_storage_capacity": "3.2 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7515_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.27, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7515_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 126500,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 120,
    "uid": "3e42da98b51dd5aa",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 5712177,
    "90.00 percentile latency (ns)": 8589191,
    "95.00 percentile latency (ns)": 10499747,
    "97.00 percentile latency (ns)": 12197523,
    "99.00 percentile latency (ns)": 25316038,
    "99.90 percentile latency (ns)": 68447230,
    "Completed samples per second": 380074.17,
    "Max latency (ns)": 78639940,
    "Mean latency (ns)": 6000388,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 253713,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 380082.59,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 12,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 380082.59,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31673.549166666668,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31673.549166666668,
    "ck_system": "DSS8440_T4x12_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 12,
    "normalize_processors": 12,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x12_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x12_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 380000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "b6cd73460cb538e5",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 5712177,
    "90.00 percentile latency (ns)": 8589191,
    "95.00 percentile latency (ns)": 10499747,
    "97.00 percentile latency (ns)": 12197523,
    "99.00 percentile latency (ns)": 25316038,
    "99.90 percentile latency (ns)": 68447230,
    "Completed samples per second": 380074.17,
    "Max latency (ns)": 78639940,
    "Mean latency (ns)": 6000388,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 253713,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 380082.59,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 12,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 380082.59,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31673.549166666668,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31673.549166666668,
    "ck_system": "DSS8440_T4x12_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 12,
    "normalize_processors": 12,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x12_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x12_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 380000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "b10d50148699e80e",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 10950629,
    "90.00 percentile latency (ns)": 15720559,
    "95.00 percentile latency (ns)": 17058870,
    "97.00 percentile latency (ns)": 18825926,
    "99.00 percentile latency (ns)": 26909301,
    "99.90 percentile latency (ns)": 39947352,
    "Completed samples per second": 279344.3,
    "Max latency (ns)": 84518064,
    "Mean latency (ns)": 10510213,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 237744,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 279363.86,
    "accelerator_frequency": "1770MHz",
    "accelerator_host_interconnect": "PCI Express 3.0",
    "accelerator_interconnect": "PCI Express 3.0",
    "accelerator_interconnect_topology": "2 Accelerators per CPU",
    "accelerator_memory_capacity": "24 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 6000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 279363.86,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 69840.965,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 69840.965,
    "ck_system": "C4140_QuadroRTX6000x4_TRT",
    "ck_used": true,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "6x16GB DDR4-2666 HMA82GR7AFR8N-VK RDIMM ECC",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "1.25MB+20MB+27.5MB",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.40GHz",
    "host_processor_interconnect": "Ultra Path Interconnect",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.6 TB (1x1.6TB Dell Express Flash PM1725a 1.6TB AIC)",
    "host_storage_type": "3D-TLC Solid State with PCIe NVME x8 Interface",
    "hw_notes": "ECC off. RTX6000 is available as a special config thru Dell DSS or OEM for PowerEdge C4140",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/C4140_QuadroRTX6000x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/C4140_QuadroRTX6000x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 279300,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "ff24708917a538b5",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 10950629,
    "90.00 percentile latency (ns)": 15720559,
    "95.00 percentile latency (ns)": 17058870,
    "97.00 percentile latency (ns)": 18825926,
    "99.00 percentile latency (ns)": 26909301,
    "99.90 percentile latency (ns)": 39947352,
    "Completed samples per second": 279344.3,
    "Max latency (ns)": 84518064,
    "Mean latency (ns)": 10510213,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 237744,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 279363.86,
    "accelerator_frequency": "1770MHz",
    "accelerator_host_interconnect": "PCI Express 3.0",
    "accelerator_interconnect": "PCI Express 3.0",
    "accelerator_interconnect_topology": "2 Accelerators per CPU",
    "accelerator_memory_capacity": "24 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 6000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 279363.86,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 69840.965,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 69840.965,
    "ck_system": "C4140_QuadroRTX6000x4_TRT",
    "ck_used": true,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "6x16GB DDR4-2666 HMA82GR7AFR8N-VK RDIMM ECC",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "1.25MB+20MB+27.5MB",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.40GHz",
    "host_processor_interconnect": "Ultra Path Interconnect",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.6 TB (1x1.6TB Dell Express Flash PM1725a 1.6TB AIC)",
    "host_storage_type": "3D-TLC Solid State with PCIe NVME x8 Interface",
    "hw_notes": "ECC off. RTX6000 is available as a special config thru Dell DSS or OEM for PowerEdge C4140",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/C4140_QuadroRTX6000x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/C4140_QuadroRTX6000x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 279300,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "9499a917c27cef95",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 9595613,
    "90.00 percentile latency (ns)": 16201583,
    "95.00 percentile latency (ns)": 17373119,
    "97.00 percentile latency (ns)": 18098219,
    "99.00 percentile latency (ns)": 20427349,
    "99.90 percentile latency (ns)": 31042959,
    "Completed samples per second": 233058.29,
    "Max latency (ns)": 49330232,
    "Mean latency (ns)": 9869348,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 178184,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 233102.78,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 233102.78,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 77700.92666666667,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 77700.92666666667,
    "ck_system": "R7525_QuadroRTX8000x3_TRT",
    "ck_used": false,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 32,
    "host_processor_frequency": "2.50GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7502",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT",
    "number_of_nodes": 1,
    "operating_system": "CentOS Linux release 8.1.1911",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT",
    "system_name": "Dell EMC PowerEdge R7525 (3x Quadro RTX 8000)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 233050,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 64,
    "uid": "396f92996c0f85d5",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 9595613,
    "90.00 percentile latency (ns)": 16201583,
    "95.00 percentile latency (ns)": 17373119,
    "97.00 percentile latency (ns)": 18098219,
    "99.00 percentile latency (ns)": 20427349,
    "99.90 percentile latency (ns)": 31042959,
    "Completed samples per second": 233058.29,
    "Max latency (ns)": 49330232,
    "Mean latency (ns)": 9869348,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 178184,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 233102.78,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 233102.78,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 77700.92666666667,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 77700.92666666667,
    "ck_system": "R7525_QuadroRTX8000x3_TRT",
    "ck_used": false,
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 32,
    "host_processor_frequency": "2.50GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7502",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT",
    "number_of_nodes": 1,
    "operating_system": "CentOS Linux release 8.1.1911",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT",
    "system_name": "Dell EMC PowerEdge R7525 (3x Quadro RTX 8000)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 233050,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 64,
    "uid": "9764e1389bd95955",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 3137321,
    "90.00 percentile latency (ns)": 6215910,
    "95.00 percentile latency (ns)": 7602037,
    "97.00 percentile latency (ns)": 8701981,
    "99.00 percentile latency (ns)": 20204875,
    "99.90 percentile latency (ns)": 56842193,
    "Completed samples per second": 126010.92,
    "Max latency (ns)": 65263329,
    "Mean latency (ns)": 3888135,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 260940,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 126014.85,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 126014.85,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31503.7125,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31503.7125,
    "ck_system": "R740_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.0.14, CUDA 11.0.207",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "DDR-4",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "3.0GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99.9",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R740_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.207, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 126000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "4056805dcf1f61c8",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 3137321,
    "90.00 percentile latency (ns)": 6215910,
    "95.00 percentile latency (ns)": 7602037,
    "97.00 percentile latency (ns)": 8701981,
    "99.00 percentile latency (ns)": 20204875,
    "99.90 percentile latency (ns)": 56842193,
    "Completed samples per second": 126010.92,
    "Max latency (ns)": 65263329,
    "Mean latency (ns)": 3888135,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 260940,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "DLRM SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 126014.85,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 126014.85,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 31503.7125,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 31503.7125,
    "ck_system": "R740_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "1TB Click Logs",
    "dataset_link": "",
    "dim_x_default": "characteristics.scheduled_queries_per_second",
    "dim_y_default": "characteristics.AUC",
    "dim_y_maximize": true,
    "division": "closed",
    "formal_model": "dlrm",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.0.14, CUDA 11.0.207",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "DDR-4",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "3.0GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "SSD",
    "hw_notes": "ECC on",
    "informal_model": "dlrm-99",
    "input_data_types": "int8",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R740_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.207, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 204800,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "tb00_40M.pt",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 30000000,
    "target_qps": 126000,
    "task": "recommendation",
    "task2": "recommendation",
    "total_cores": 240,
    "uid": "bfcd9573832434b8",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  }
]