[
  {
    "50.00 percentile latency (ns)": 104945172,
    "90.00 percentile latency (ns)": 119919390,
    "95.00 percentile latency (ns)": 122695906,
    "97.00 percentile latency (ns)": 124479543,
    "99.00 percentile latency (ns)": 127732996,
    "99.90 percentile latency (ns)": 132897344,
    "Completed samples per second": 26745.53,
    "Max latency (ns)": 142367860,
    "Mean latency (ns)": 101919188,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 3376343,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 26748.65,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIe-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "",
    "characteristics.scheduled_queries_per_second": 26748.65,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 2674.8650000000002,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 2674.8650000000002,
    "ck_system": "DSS8440_A100-PCIE-40GBx10_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "disk_controllers": "",
    "disk_drives": "",
    "division": "closed",
    "filesystem": "",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "management_firmware_version": "",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "network_speed_mbit": "",
    "nics_enabled_connected": "",
    "nics_enabled_firmware": "",
    "nics_enabled_os": "",
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/DSS8440_A100-PCIE-40GBx10_TRT",
    "number_of_nodes": 1,
    "number_of_type_nics_installed": "",
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_hardware": "",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "power_supply_details": "",
    "power_supply_quantity_and_rating_watts": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_A100-PCIE-40GBx10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 26756,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "a419187e3b011604",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 34485106,
    "90.00 percentile latency (ns)": 54200763,
    "95.00 percentile latency (ns)": 60871148,
    "97.00 percentile latency (ns)": 65468084,
    "99.00 percentile latency (ns)": 74503184,
    "99.90 percentile latency (ns)": 90821065,
    "Completed samples per second": 12118.92,
    "Max latency (ns)": 118659024,
    "Mean latency (ns)": 36407229,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4320794,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 12119.41,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIe-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "",
    "characteristics.scheduled_queries_per_second": 12119.41,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1211.941,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1211.941,
    "ck_system": "DSS8440_A100-PCIE-40GBx10_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "disk_controllers": "",
    "disk_drives": "",
    "division": "closed",
    "filesystem": "",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "management_firmware_version": "",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "network_speed_mbit": "",
    "nics_enabled_connected": "",
    "nics_enabled_firmware": "",
    "nics_enabled_os": "",
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/DSS8440_A100-PCIE-40GBx10_TRT",
    "number_of_nodes": 1,
    "number_of_type_nics_installed": "",
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_hardware": "",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "power_supply_details": "",
    "power_supply_quantity_and_rating_watts": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_A100-PCIE-40GBx10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 12119,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "460464664a719c39",
    "use_accelerator": true,
    "weight_data_types": "fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 48483654,
    "90.00 percentile latency (ns)": 59664139,
    "95.00 percentile latency (ns)": 62364051,
    "97.00 percentile latency (ns)": 64190505,
    "99.00 percentile latency (ns)": 72421975,
    "99.90 percentile latency (ns)": 543628603186,
    "Completed samples per second": 12700.94,
    "Max latency (ns)": 606248212800,
    "Mean latency (ns)": 2975412929,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 3497278,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 12701.84,
    "accelerator_cooling_type": "",
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "80GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-SXM-80GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 12701.84,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 3175.46,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 3175.46,
    "ck_system": "XE8545_7763_A100-SXM4-80GBx4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_cooling_type": "",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "500W A100-SXM-80GB",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE8545_7763_A100-SXM4-80GBx4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE8545_7763_A100-SXM4-80GBx4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 12700,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "4261af5a1f1c4959",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 44856233,
    "90.00 percentile latency (ns)": 56378344,
    "95.00 percentile latency (ns)": 59163821,
    "97.00 percentile latency (ns)": 61080643,
    "99.00 percentile latency (ns)": 69852209,
    "99.90 percentile latency (ns)": 544712172423,
    "Completed samples per second": 6852.37,
    "Max latency (ns)": 606485158466,
    "Mean latency (ns)": 2999141948,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4248384,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 6852.76,
    "accelerator_cooling_type": "",
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "80GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-SXM-80GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 6852.76,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1713.19,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1713.19,
    "ck_system": "XE8545_7763_A100-SXM4-80GBx4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_cooling_type": "",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "500W A100-SXM-80GB",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE8545_7763_A100-SXM4-80GBx4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE8545_7763_A100-SXM4-80GBx4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 6850,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "05d6875243948b16",
    "use_accelerator": true,
    "weight_data_types": "fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 88287414,
    "90.00 percentile latency (ns)": 100879320,
    "95.00 percentile latency (ns)": 104431674,
    "97.00 percentile latency (ns)": 107022472,
    "99.00 percentile latency (ns)": 123276544,
    "99.90 percentile latency (ns)": 544530096262,
    "Completed samples per second": 12769.97,
    "Max latency (ns)": 606419884274,
    "Mean latency (ns)": 3102058402,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 3880874,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 12771.47,
    "accelerator_cooling_type": "",
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-SXM-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 12771.47,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 3192.8675,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 3192.8675,
    "ck_system": "XE8545_7713_A100-SXM4-40GBx4",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_cooling_type": "",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE8545_7713_A100-SXM4-40GBx4",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE8545_7713_A100-SXM4-40GBx4",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 12770,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "1db68ad0e4a19889",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 49505761,
    "90.00 percentile latency (ns)": 73558724,
    "95.00 percentile latency (ns)": 81495576,
    "97.00 percentile latency (ns)": 87249535,
    "99.00 percentile latency (ns)": 115077689,
    "99.90 percentile latency (ns)": 538041877448,
    "Completed samples per second": 6300.8,
    "Max latency (ns)": 605403829107,
    "Mean latency (ns)": 3022496692,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4308926,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 6301.4,
    "accelerator_cooling_type": "",
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-SXM-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 6301.4,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1575.35,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1575.35,
    "ck_system": "XE8545_7713_A100-SXM4-40GBx4",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_cooling_type": "",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE8545_7713_A100-SXM4-40GBx4",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE8545_7713_A100-SXM4-40GBx4",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 6300,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "5096170eeb04d408",
    "use_accelerator": true,
    "weight_data_types": "fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 91669054,
    "90.00 percentile latency (ns)": 104499561,
    "95.00 percentile latency (ns)": 108633392,
    "97.00 percentile latency (ns)": 111771061,
    "99.00 percentile latency (ns)": 127471935,
    "99.90 percentile latency (ns)": 545960141594,
    "Completed samples per second": 12699.97,
    "Max latency (ns)": 606308822619,
    "Mean latency (ns)": 3083279253,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 3522851,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 12701.84,
    "accelerator_cooling_type": "",
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-SXM-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 12701.84,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 3175.46,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 3175.46,
    "ck_system": "XE8545_7763_A100-SXM4-40GBx4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_cooling_type": "",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE8545_7763_A100-SXM4-40GBx4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE8545_7763_A100-SXM4-40GBx4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 12700,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "5218c97139ff4d81",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 51978730,
    "90.00 percentile latency (ns)": 77177419,
    "95.00 percentile latency (ns)": 85343669,
    "97.00 percentile latency (ns)": 91237274,
    "99.00 percentile latency (ns)": 114748655,
    "99.90 percentile latency (ns)": 541226742241,
    "Completed samples per second": 6300.76,
    "Max latency (ns)": 605267120958,
    "Mean latency (ns)": 2905656501,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4376729,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 6301.4,
    "accelerator_cooling_type": "",
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-SXM-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 6301.4,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1575.35,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1575.35,
    "ck_system": "XE8545_7763_A100-SXM4-40GBx4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_cooling_type": "",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE8545_7763_A100-SXM4-40GBx4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE8545_7763_A100-SXM4-40GBx4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 6300,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "5d49c9f095ff43d9",
    "use_accelerator": true,
    "weight_data_types": "fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 76527804,
    "90.00 percentile latency (ns)": 100968043,
    "95.00 percentile latency (ns)": 108883018,
    "97.00 percentile latency (ns)": 114056527,
    "99.00 percentile latency (ns)": 123645171,
    "99.90 percentile latency (ns)": 138730205,
    "Completed samples per second": 10899.57,
    "Max latency (ns)": 162805271,
    "Mean latency (ns)": 78369269,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4202022,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 10900.72,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIe-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "",
    "characteristics.scheduled_queries_per_second": 10900.72,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 2725.18,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 2725.18,
    "ck_system": "R750xa_A100-PCIE-40GBx4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "disk_controllers": "",
    "disk_drives": "",
    "division": "closed",
    "filesystem": "",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "256 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "4 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "management_firmware_version": "",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "network_speed_mbit": "",
    "nics_enabled_connected": "",
    "nics_enabled_firmware": "",
    "nics_enabled_os": "",
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R750xa_A100-PCIE-40GBx4_TRT",
    "number_of_nodes": 1,
    "number_of_type_nics_installed": "",
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_hardware": "",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "power_supply_details": "",
    "power_supply_quantity_and_rating_watts": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "preview",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R750xa_A100-PCIE-40GBx4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 10900,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "c4d5eacf3aa42b7b",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 47165338,
    "90.00 percentile latency (ns)": 70438221,
    "95.00 percentile latency (ns)": 79418911,
    "97.00 percentile latency (ns)": 85985797,
    "99.00 percentile latency (ns)": 99006481,
    "99.90 percentile latency (ns)": 120315764,
    "Completed samples per second": 5201.09,
    "Max latency (ns)": 150514624,
    "Mean latency (ns)": 50173610,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4172890,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 5201.46,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIe-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "",
    "characteristics.scheduled_queries_per_second": 5201.46,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1300.365,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1300.365,
    "ck_system": "R750xa_A100-PCIE-40GBx4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "disk_controllers": "",
    "disk_drives": "",
    "division": "closed",
    "filesystem": "",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "256 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "4 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "management_firmware_version": "",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "network_speed_mbit": "",
    "nics_enabled_connected": "",
    "nics_enabled_firmware": "",
    "nics_enabled_os": "",
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R750xa_A100-PCIE-40GBx4_TRT",
    "number_of_nodes": 1,
    "number_of_type_nics_installed": "",
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_hardware": "",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "power_supply_details": "",
    "power_supply_quantity_and_rating_watts": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "preview",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R750xa_A100-PCIE-40GBx4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 5200,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "87ab0c6306324904",
    "use_accelerator": true,
    "weight_data_types": "fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 33471839,
    "90.00 percentile latency (ns)": 39987934,
    "95.00 percentile latency (ns)": 41595357,
    "97.00 percentile latency (ns)": 42635350,
    "99.00 percentile latency (ns)": 44692549,
    "99.90 percentile latency (ns)": 48971194,
    "Completed samples per second": 13899.51,
    "Max latency (ns)": 62034291,
    "Mean latency (ns)": 33656259,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4976969,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 13900.22,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA A40",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "",
    "characteristics.scheduled_queries_per_second": 13900.22,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1390.022,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1390.022,
    "ck_system": "DSS8440_A40x10_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "disk_controllers": "",
    "disk_drives": "",
    "division": "closed",
    "filesystem": "",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "management_firmware_version": "",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "network_speed_mbit": "",
    "nics_enabled_connected": "",
    "nics_enabled_firmware": "",
    "nics_enabled_os": "",
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/DSS8440_A40x10_TRT",
    "number_of_nodes": 1,
    "number_of_type_nics_installed": "",
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_hardware": "",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "power_supply_details": "",
    "power_supply_quantity_and_rating_watts": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_A40x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 13900,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "617a41a043f4804a",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 27323712,
    "90.00 percentile latency (ns)": 32916415,
    "95.00 percentile latency (ns)": 34455967,
    "97.00 percentile latency (ns)": 35466031,
    "99.00 percentile latency (ns)": 37456393,
    "99.90 percentile latency (ns)": 41019051,
    "Completed samples per second": 5801.83,
    "Max latency (ns)": 49703198,
    "Mean latency (ns)": 27535820,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 7265715,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 5802.07,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA A40",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "boot_firmware_version": "",
    "characteristics.scheduled_queries_per_second": 5802.07,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 580.207,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 580.207,
    "ck_system": "DSS8440_A40x10_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "disk_controllers": "",
    "disk_drives": "",
    "division": "closed",
    "filesystem": "",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "management_firmware_version": "",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "network_speed_mbit": "",
    "nics_enabled_connected": "",
    "nics_enabled_firmware": "",
    "nics_enabled_os": "",
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/DSS8440_A40x10_TRT",
    "number_of_nodes": 1,
    "number_of_type_nics_installed": "",
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_hardware": "",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "power_management": "",
    "power_supply_details": "",
    "power_supply_quantity_and_rating_watts": "",
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_A40x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 5800,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "3b443a1a154ebc29",
    "use_accelerator": true,
    "weight_data_types": "fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 55621899,
    "90.00 percentile latency (ns)": 71952030,
    "95.00 percentile latency (ns)": 77077839,
    "97.00 percentile latency (ns)": 80463898,
    "99.00 percentile latency (ns)": 86921930,
    "99.90 percentile latency (ns)": 97939831,
    "Completed samples per second": 5160.72,
    "Max latency (ns)": 127030730,
    "Mean latency (ns)": 56552074,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4333338,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 5161.14,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIe-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 2,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 5161.14,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 2580.57,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 2580.57,
    "ck_system": "R7525_A100-PCIe-40GBx2_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.5 GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "4 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 2,
    "normalize_processors": 2,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R7525_A100-PCIe-40GBx2_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100-PCIe-40GBx2_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 5160,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "a398b9f4a06b77ab",
    "use_accelerator": true,
    "weight_data_types": "int8",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 36883399,
    "90.00 percentile latency (ns)": 51978010,
    "95.00 percentile latency (ns)": 57396941,
    "97.00 percentile latency (ns)": 61199599,
    "99.00 percentile latency (ns)": 69038264,
    "99.90 percentile latency (ns)": 86685933,
    "Completed samples per second": 2429.71,
    "Max latency (ns)": 126616091,
    "Mean latency (ns)": 38457892,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4298419,
    "Min queries satisfied": "Yes",
    "Mode": "PerformanceOnly",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 2429.85,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIe-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 2,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 2429.85,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1214.925,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1214.925,
    "ck_system": "R7525_A100-PCIe-40GBx2_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.3, CUDA 11.1",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.5 GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "4 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 600000,
    "min_query_count": 270336,
    "mlperf_version": 1.0,
    "normalize_cores": 2,
    "normalize_processors": 2,
    "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R7525_A100-PCIe-40GBx2_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": 0,
    "performance_issue_same_index": 0,
    "performance_issue_unique": 0,
    "performance_sample_count": 10833,
    "print_timestamps": 0,
    "problem": false,
    "qsl_rng_seed": 7322528924094909334,
    "retraining": "N",
    "sample_index_rng_seed": 1570999273408051088,
    "samples_per_query": 1,
    "schedule_rng_seed": 3507442325620259414,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100-PCIe-40GBx2_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 2430,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "2cbcc72e479f6cee",
    "use_accelerator": true,
    "weight_data_types": "fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 50622437,
    "90.00 percentile latency (ns)": 66224751,
    "95.00 percentile latency (ns)": 70874484,
    "97.00 percentile latency (ns)": 74075416,
    "99.00 percentile latency (ns)": 85386828,
    "99.90 percentile latency (ns)": 82330038387,
    "Completed samples per second": 2995.85,
    "Max latency (ns)": 91574583748,
    "Mean latency (ns)": 460482971,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 6431233,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 2997.31,
    "accelerator_frequency": "1590MHz",
    "accelerator_host_interconnect": "PCI Express 3.0",
    "accelerator_interconnect": "PCI Express 3.0",
    "accelerator_interconnect_topology": "4 Accelerators per CPU",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Tesla T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 8,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 2997.31,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 374.66375,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 374.66375,
    "ck_system": "R7525_T4x8_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "8x64GB DDR4-3200 HMAA8GR7AJR4N-XN RDIMM ECC",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "2MB+16MB+128MB",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.35GHz",
    "host_processor_interconnect": "Infinity Fabric",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB (5x800GB WUSTR6480ASS200 in RAID5)",
    "host_storage_type": "3D-TLC Solid State with 12Gbps SAS",
    "hw_notes": "ECC off",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 8,
    "normalize_processors": 8,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_T4x8_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_T4x8_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 3000,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "24fc2930f82a5f37",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 49799421,
    "90.00 percentile latency (ns)": 69774592,
    "95.00 percentile latency (ns)": 75814422,
    "97.00 percentile latency (ns)": 79942155,
    "99.00 percentile latency (ns)": 91840864,
    "99.90 percentile latency (ns)": 168924945196,
    "Completed samples per second": 1418.38,
    "Max latency (ns)": 191900326468,
    "Mean latency (ns)": 878198684,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 7327423,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 1418.73,
    "accelerator_frequency": "1590MHz",
    "accelerator_host_interconnect": "PCI Express 3.0",
    "accelerator_interconnect": "PCI Express 3.0",
    "accelerator_interconnect_topology": "4 Accelerators per CPU",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Tesla T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 8,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 1418.73,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 177.34125,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 177.34125,
    "ck_system": "R7525_T4x8_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "1 TB",
    "host_memory_configuration": "8x64GB DDR4-3200 HMAA8GR7AJR4N-XN RDIMM ECC",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "2MB+16MB+128MB",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.35GHz",
    "host_processor_interconnect": "Infinity Fabric",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3 TB (5x800GB WUSTR6480ASS200 in RAID5)",
    "host_storage_type": "3D-TLC Solid State with 12Gbps SAS",
    "hw_notes": "ECC off",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 8,
    "normalize_processors": 8,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_T4x8_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_T4x8_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 1420,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "654bde4371981a77",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 60186300,
    "90.00 percentile latency (ns)": 79894483,
    "95.00 percentile latency (ns)": 85740803,
    "97.00 percentile latency (ns)": 89704820,
    "99.00 percentile latency (ns)": 100014605,
    "99.90 percentile latency (ns)": 151761128325,
    "Completed samples per second": 1608.06,
    "Max latency (ns)": 168962900684,
    "Mean latency (ns)": 702448535,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 5540419,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 1608.55,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 1608.55,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 402.1375,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 402.1375,
    "ck_system": "XE2420_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.10GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/XE2420_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 1610,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "24ddab4a2255e3c4",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 57779198,
    "90.00 percentile latency (ns)": 82655786,
    "95.00 percentile latency (ns)": 90604825,
    "97.00 percentile latency (ns)": 96392744,
    "99.00 percentile latency (ns)": 113072155,
    "99.90 percentile latency (ns)": 322650067288,
    "Completed samples per second": 759.2,
    "Max latency (ns)": 359244396430,
    "Mean latency (ns)": 1528687481,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 6500371,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 759.32,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 759.32,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 189.83,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 189.83,
    "ck_system": "XE2420_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.10GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/XE2420_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 760,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "29ea143e56435d92",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 48445627,
    "90.00 percentile latency (ns)": 61747959,
    "95.00 percentile latency (ns)": 65731726,
    "97.00 percentile latency (ns)": 68831219,
    "99.00 percentile latency (ns)": 95409973,
    "99.90 percentile latency (ns)": 54261626636,
    "Completed samples per second": 8561.31,
    "Max latency (ns)": 60734141037,
    "Mean latency (ns)": 335381306,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4883942,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 8568.28,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 8,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 8568.28,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1071.035,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1071.035,
    "ck_system": "DSS8440_QuadroRTX8000x8_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.8 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 8,
    "normalize_processors": 8,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX8000x8_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX8000x8_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 8570,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "e9e8dcb62744a9a1",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 78235550,
    "90.00 percentile latency (ns)": 99547931,
    "95.00 percentile latency (ns)": 105222398,
    "97.00 percentile latency (ns)": 108895633,
    "99.00 percentile latency (ns)": 118432058,
    "99.90 percentile latency (ns)": 54825297605,
    "Completed samples per second": 4490.61,
    "Max latency (ns)": 60915739827,
    "Mean latency (ns)": 322424945,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 7191256,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 4495.97,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 8,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 4495.97,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 561.99625,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 561.99625,
    "ck_system": "DSS8440_QuadroRTX8000x8_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.8 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 8,
    "normalize_processors": 8,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX8000x8_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX8000x8_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 4500,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "7459a9fe6d70da4a",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 44205879,
    "90.00 percentile latency (ns)": 56644827,
    "95.00 percentile latency (ns)": 60271915,
    "97.00 percentile latency (ns)": 62691470,
    "99.00 percentile latency (ns)": 69583472,
    "99.90 percentile latency (ns)": 54011739788,
    "Completed samples per second": 10648.37,
    "Max latency (ns)": 62233405280,
    "Mean latency (ns)": 305060164,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4480764,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 10653.37,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "24 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 6000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 10653.37,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1065.337,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1065.337,
    "ck_system": "DSS8440_QuadroRTX6000x10_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.8 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX6000x10_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX6000x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 10650,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "69b39c089443969e",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 68034310,
    "90.00 percentile latency (ns)": 87198086,
    "95.00 percentile latency (ns)": 92878838,
    "97.00 percentile latency (ns)": 96767360,
    "99.00 percentile latency (ns)": 109783526,
    "99.90 percentile latency (ns)": 54403892172,
    "Completed samples per second": 5598.7,
    "Max latency (ns)": 61851825348,
    "Mean latency (ns)": 332027584,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 7656199,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 5603.79,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "24 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 6000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 5603.79,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 560.379,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 560.379,
    "ck_system": "DSS8440_QuadroRTX6000x10_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.8 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX6000x10_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX6000x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 5600,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "06455e64d0184be3",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 52201536,
    "90.00 percentile latency (ns)": 70455025,
    "95.00 percentile latency (ns)": 75540736,
    "97.00 percentile latency (ns)": 78704923,
    "99.00 percentile latency (ns)": 84577450,
    "99.90 percentile latency (ns)": 93260696,
    "Completed samples per second": 5094.27,
    "Max latency (ns)": 102541658,
    "Mean latency (ns)": 53234327,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 2812877,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 5100.0,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIE-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 2,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 5100.0,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 2550.0,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 2550.0,
    "ck_system": "R7525_A100x2_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.50GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 2,
    "normalize_processors": 2,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_A100x2_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100x2_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 5100,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "f8facd1e584fa58f",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 34326765,
    "90.00 percentile latency (ns)": 46879274,
    "95.00 percentile latency (ns)": 51602493,
    "97.00 percentile latency (ns)": 54854678,
    "99.00 percentile latency (ns)": 61800177,
    "99.90 percentile latency (ns)": 72980177,
    "Completed samples per second": 2427.19,
    "Max latency (ns)": 85479680,
    "Mean latency (ns)": 35430064,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4044334,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 2427.82,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIE-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 2,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 2427.82,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1213.91,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1213.91,
    "ck_system": "R7525_A100x2_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.50GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 2,
    "normalize_processors": 2,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_A100x2_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100x2_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 2430,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "3e8f04856da64198",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 46637952,
    "90.00 percentile latency (ns)": 64658409,
    "95.00 percentile latency (ns)": 70355641,
    "97.00 percentile latency (ns)": 73993817,
    "99.00 percentile latency (ns)": 81120594,
    "99.90 percentile latency (ns)": 94314707,
    "Completed samples per second": 7488.91,
    "Max latency (ns)": 109124171,
    "Mean latency (ns)": 47480104,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 2863091,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 7496.5,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIE-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 7496.5,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 2498.8333333333335,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 2498.8333333333335,
    "ck_system": "R7525_A100x3_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.40GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.5 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_A100x3_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "preview",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100x3_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 7500,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "fa5039b5a45e390e",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 38101202,
    "90.00 percentile latency (ns)": 55258144,
    "95.00 percentile latency (ns)": 61504789,
    "97.00 percentile latency (ns)": 66018152,
    "99.00 percentile latency (ns)": 75571057,
    "99.90 percentile latency (ns)": 93558441,
    "Completed samples per second": 3694.99,
    "Max latency (ns)": 111676640,
    "Mean latency (ns)": 39592060,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 3969981,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 3696.68,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "40 GB",
    "accelerator_memory_configuration": "HBM2",
    "accelerator_model_name": "NVIDIA A100-PCIE-40GB",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 3696.68,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1232.2266666666667,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1232.2266666666667,
    "ck_system": "R7525_A100x3_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.40GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.5 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_A100x3_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "preview",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100x3_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 3700,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "14e2a948c96d1b5d",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 62826420,
    "90.00 percentile latency (ns)": 82805082,
    "95.00 percentile latency (ns)": 89179175,
    "97.00 percentile latency (ns)": 93721244,
    "99.00 percentile latency (ns)": 110483496,
    "99.90 percentile latency (ns)": 54220643662,
    "Completed samples per second": 10793.1,
    "Max latency (ns)": 61397661553,
    "Mean latency (ns)": 322960557,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4004284,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 10803.78,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 10803.78,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1080.3780000000002,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1080.3780000000002,
    "ck_system": "DSS8440_QuadroRTX8000x10_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.10GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX8000x10_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX8000x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 10800,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "864219af6c96e388",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 69885963,
    "90.00 percentile latency (ns)": 92846374,
    "95.00 percentile latency (ns)": 99890754,
    "97.00 percentile latency (ns)": 104785590,
    "99.00 percentile latency (ns)": 120677256,
    "99.90 percentile latency (ns)": 54363807544,
    "Completed samples per second": 5594.53,
    "Max latency (ns)": 61276712414,
    "Mean latency (ns)": 317623322,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 6174448,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 5603.79,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 10,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 5603.79,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 560.379,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 560.379,
    "ck_system": "DSS8440_QuadroRTX8000x10_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.10GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 10,
    "normalize_processors": 10,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX8000x10_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX8000x10_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 5600,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "d489c836a925b4ae",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 62671129,
    "90.00 percentile latency (ns)": 80136392,
    "95.00 percentile latency (ns)": 85228977,
    "97.00 percentile latency (ns)": 88699210,
    "99.00 percentile latency (ns)": 101583361,
    "99.90 percentile latency (ns)": 54820233434,
    "Completed samples per second": 6230.44,
    "Max latency (ns)": 61424676949,
    "Mean latency (ns)": 326515125,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 5630362,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 6237.43,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 16,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 6237.43,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 389.839375,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 389.839375,
    "ck_system": "DSS8440_T4x16_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "ECC off",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 16,
    "normalize_processors": 16,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x16_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x16_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 6231,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "2a790211c4c1ab57",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 59641223,
    "90.00 percentile latency (ns)": 79191473,
    "95.00 percentile latency (ns)": 84712108,
    "97.00 percentile latency (ns)": 88431699,
    "99.00 percentile latency (ns)": 101264038,
    "99.90 percentile latency (ns)": 82630814058,
    "Completed samples per second": 2972.88,
    "Max latency (ns)": 92322040799,
    "Mean latency (ns)": 472109661,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 6182853,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 2974.33,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 16,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 2974.33,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 185.895625,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 185.895625,
    "ck_system": "DSS8440_T4x16_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "ECC off",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 16,
    "normalize_processors": 16,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x16_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x16_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 2977,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "44b29c1a324556a7",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 40330487,
    "90.00 percentile latency (ns)": 56028622,
    "95.00 percentile latency (ns)": 61534865,
    "97.00 percentile latency (ns)": 65771795,
    "99.00 percentile latency (ns)": 92427474,
    "99.90 percentile latency (ns)": 192128576195,
    "Completed samples per second": 1278.59,
    "Max latency (ns)": 213754610484,
    "Mean latency (ns)": 1093446528,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 5153862,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 1278.85,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 1278.85,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 319.7125,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 319.7125,
    "ck_system": "R7515_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.0.14, CUDA 11.0.207",
    "host_memory_capacity": "256 GB",
    "host_memory_configuration": "DDR-4",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.0GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 1,
    "host_storage_capacity": "3.2 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7515_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.27, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7515_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 1280,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 120,
    "uid": "8b25606feb0c4aac",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 40220099,
    "90.00 percentile latency (ns)": 60048816,
    "95.00 percentile latency (ns)": 66741253,
    "97.00 percentile latency (ns)": 71656809,
    "99.00 percentile latency (ns)": 102742257,
    "99.90 percentile latency (ns)": 383344939389,
    "Completed samples per second": 629.38,
    "Max latency (ns)": 432589190158,
    "Mean latency (ns)": 2178532488,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 6193734,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 629.43,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 629.43,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 157.3575,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 157.3575,
    "ck_system": "R7515_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.0.14, CUDA 11.0.207",
    "host_memory_capacity": "256 GB",
    "host_memory_configuration": "DDR-4",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.0GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 1,
    "host_storage_capacity": "3.2 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC on",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7515_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.27, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7515_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 630,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 120,
    "uid": "647278fbbb46a4d6",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 72737669,
    "90.00 percentile latency (ns)": 96113963,
    "95.00 percentile latency (ns)": 102924540,
    "97.00 percentile latency (ns)": 107412065,
    "99.00 percentile latency (ns)": 119053861,
    "99.90 percentile latency (ns)": 50022270186,
    "Completed samples per second": 4739.74,
    "Max latency (ns)": 61251299713,
    "Mean latency (ns)": 291062342,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 7323240,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 4747.46,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 12,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 4747.46,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 395.62166666666667,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 395.62166666666667,
    "ck_system": "DSS8440_T4x12_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 12,
    "normalize_processors": 12,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x12_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x12_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 4750,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "c0d84438a9113a4c",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 55002816,
    "90.00 percentile latency (ns)": 74183429,
    "95.00 percentile latency (ns)": 79597222,
    "97.00 percentile latency (ns)": 83167870,
    "99.00 percentile latency (ns)": 93137650,
    "99.90 percentile latency (ns)": 105075230277,
    "Completed samples per second": 2197.13,
    "Max latency (ns)": 123925110794,
    "Mean latency (ns)": 510494635,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 7725217,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 2198.03,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 12,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 2198.03,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 183.16916666666668,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 183.16916666666668,
    "ck_system": "DSS8440_T4x12_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0 Update 1",
    "host_memory_capacity": "768 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1 TB",
    "host_storage_type": "NVMe SSD",
    "hw_notes": "ECC off",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 12,
    "normalize_processors": 12,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x12_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x12_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 2200,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "277b3375bdd2dd4c",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 49687917,
    "90.00 percentile latency (ns)": 64518901,
    "95.00 percentile latency (ns)": 68584136,
    "97.00 percentile latency (ns)": 71203988,
    "99.00 percentile latency (ns)": 78957425,
    "99.90 percentile latency (ns)": 58023786556,
    "Completed samples per second": 4232.81,
    "Max latency (ns)": 65068620121,
    "Mean latency (ns)": 331704592,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4425494,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 4236.2,
    "accelerator_frequency": "1770MHz",
    "accelerator_host_interconnect": "PCI Express 3.0",
    "accelerator_interconnect": "PCI Express 3.0",
    "accelerator_interconnect_topology": "2 Accelerators per CPU",
    "accelerator_memory_capacity": "24 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 6000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 4236.2,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 1059.05,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 1059.05,
    "ck_system": "C4140_QuadroRTX6000x4_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "6x16GB DDR4-2666 HMA82GR7AFR8N-VK RDIMM ECC",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "1.25MB+20MB+27.5MB",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.40GHz",
    "host_processor_interconnect": "Ultra Path Interconnect",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.6 TB (1x1.6TB Dell Express Flash PM1725a 1.6TB AIC)",
    "host_storage_type": "3D-TLC Solid State with PCIe NVME x8 Interface",
    "hw_notes": "ECC off. RTX6000 is available as a special config thru Dell DSS or OEM for PowerEdge C4140",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/C4140_QuadroRTX6000x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/C4140_QuadroRTX6000x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 4240,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "75a68c6242a2a8b5",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 79099518,
    "90.00 percentile latency (ns)": 104105469,
    "95.00 percentile latency (ns)": 111561867,
    "97.00 percentile latency (ns)": 116473350,
    "99.00 percentile latency (ns)": 128654340,
    "99.90 percentile latency (ns)": 107437904771,
    "Completed samples per second": 2276.55,
    "Max latency (ns)": 120580537071,
    "Mean latency (ns)": 563811152,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 9740367,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 2277.95,
    "accelerator_frequency": "1770MHz",
    "accelerator_host_interconnect": "PCI Express 3.0",
    "accelerator_interconnect": "PCI Express 3.0",
    "accelerator_interconnect_topology": "2 Accelerators per CPU",
    "accelerator_memory_capacity": "24 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 6000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 2277.95,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 569.4875,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 569.4875,
    "ck_system": "C4140_QuadroRTX6000x4_TRT",
    "ck_used": true,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "6x16GB DDR4-2666 HMA82GR7AFR8N-VK RDIMM ECC",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "1.25MB+20MB+27.5MB",
    "host_processor_core_count": 120,
    "host_processor_frequency": "2.40GHz",
    "host_processor_interconnect": "Ultra Path Interconnect",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.6 TB (1x1.6TB Dell Express Flash PM1725a 1.6TB AIC)",
    "host_storage_type": "3D-TLC Solid State with PCIe NVME x8 Interface",
    "hw_notes": "ECC off. RTX6000 is available as a special config thru Dell DSS or OEM for PowerEdge C4140",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/C4140_QuadroRTX6000x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/C4140_QuadroRTX6000x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 2280,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "59f8e74e7ca8596e",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 55570921,
    "90.00 percentile latency (ns)": 71191203,
    "95.00 percentile latency (ns)": 76293046,
    "97.00 percentile latency (ns)": 79813477,
    "99.00 percentile latency (ns)": 89955459,
    "99.90 percentile latency (ns)": 86836487424,
    "Completed samples per second": 2806.3,
    "Max latency (ns)": 96987521184,
    "Mean latency (ns)": 457179084,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 4885947,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 2807.48,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 2807.48,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 935.8266666666667,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 935.8266666666667,
    "ck_system": "R7525_QuadroRTX8000x3_TRT",
    "ck_used": false,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 32,
    "host_processor_frequency": "2.50GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7502",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT",
    "number_of_nodes": 1,
    "operating_system": "CentOS Linux release 8.1.1911",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT",
    "system_name": "Dell EMC PowerEdge R7525 (3x Quadro RTX 8000)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 2810,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 64,
    "uid": "1765a646648cd8d7",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 60473686,
    "90.00 percentile latency (ns)": 80392596,
    "95.00 percentile latency (ns)": 86818940,
    "97.00 percentile latency (ns)": 91380074,
    "99.00 percentile latency (ns)": 103596513,
    "99.90 percentile latency (ns)": 148287221433,
    "Completed samples per second": 1647.87,
    "Max latency (ns)": 164662760681,
    "Mean latency (ns)": 665930485,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 8947861,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 1648.52,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "48 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA Quadro RTX 8000",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 3,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 1648.52,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 549.5066666666667,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 549.5066666666667,
    "ck_system": "R7525_QuadroRTX8000x3_TRT",
    "ck_used": false,
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch",
    "host_memory_capacity": "512 GB",
    "host_memory_configuration": "",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 32,
    "host_processor_frequency": "2.50GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7502",
    "host_processors_per_node": 2,
    "host_storage_capacity": "1.84 TB",
    "host_storage_type": "NVMe",
    "hw_notes": "",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 3,
    "normalize_processors": 3,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT",
    "number_of_nodes": 1,
    "operating_system": "CentOS Linux release 8.1.1911",
    "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT",
    "system_name": "Dell EMC PowerEdge R7525 (3x Quadro RTX 8000)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 1650,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 64,
    "uid": "5869390ef0408780",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 44960755,
    "90.00 percentile latency (ns)": 62450343,
    "95.00 percentile latency (ns)": 68507723,
    "97.00 percentile latency (ns)": 73300496,
    "99.00 percentile latency (ns)": 112720840,
    "99.90 percentile latency (ns)": 181074885228,
    "Completed samples per second": 1348.4,
    "Max latency (ns)": 202350277680,
    "Mean latency (ns)": 1038018269,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 5304967,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 1348.79,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 1348.79,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 337.1975,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 337.1975,
    "ck_system": "R740_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.0,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.0.14, CUDA 11.0.207",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "DDR-4",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "3.0GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "SSD",
    "hw_notes": "ECC on",
    "informal_model": "bert-99",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R740_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.207, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 1350,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "b61bf741af8cb698",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  },
  {
    "50.00 percentile latency (ns)": 46738588,
    "90.00 percentile latency (ns)": 67948475,
    "95.00 percentile latency (ns)": 74626771,
    "97.00 percentile latency (ns)": 79444741,
    "99.00 percentile latency (ns)": 102898658,
    "99.90 percentile latency (ns)": 362314013075,
    "Completed samples per second": 679.32,
    "Max latency (ns)": 400282968188,
    "Mean latency (ns)": 2000566759,
    "Min duration satisfied": "Yes",
    "Min latency (ns)": 6150344,
    "Min queries satisfied": "Yes",
    "Mode": "Performance",
    "Performance constraints satisfied": "Yes",
    "Result is": "VALID",
    "SUT name": "BERT SERVER",
    "Scenario": "server",
    "Scheduled samples per second": 679.39,
    "accelerator_frequency": "",
    "accelerator_host_interconnect": "",
    "accelerator_interconnect": "",
    "accelerator_interconnect_topology": "",
    "accelerator_memory_capacity": "16 GB",
    "accelerator_memory_configuration": "GDDR6",
    "accelerator_model_name": "NVIDIA T4",
    "accelerator_on-chip_memories": "",
    "accelerators_per_node": 4,
    "accuracy_log_probability": 0,
    "accuracy_log_rng_seed": 0,
    "accuracy_log_sampling_target": 0,
    "characteristics.scheduled_queries_per_second": 679.39,
    "characteristics.scheduled_queries_per_second.normalized_per_core": 169.8475,
    "characteristics.scheduled_queries_per_second.normalized_per_processor": 169.8475,
    "ck_system": "R740_T4x4_TRT",
    "ck_used": true,
    "cooling": "",
    "dataset": "SQuAD v1.1",
    "dataset_link": "",
    "dim_x_default": "seq_number",
    "dim_y_default": "characteristics.scheduled_queries_per_second",
    "dim_y_maximize": false,
    "division": "closed",
    "formal_model": "bert",
    "formal_model_accuracy": 99.9,
    "formal_model_link": "",
    "framework": "TensorRT 7.2.0.14, CUDA 11.0.207",
    "host_memory_capacity": "384 GB",
    "host_memory_configuration": "DDR-4",
    "host_networking": "",
    "host_networking_topology": "",
    "host_processor_caches": "",
    "host_processor_core_count": 120,
    "host_processor_frequency": "3.0GHz",
    "host_processor_interconnect": "",
    "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor",
    "host_processors_per_node": 2,
    "host_storage_capacity": "3.84 TB",
    "host_storage_type": "SSD",
    "hw_notes": "ECC on",
    "informal_model": "bert-99.9",
    "input_data_types": "int32",
    "max_async_queries": 0,
    "max_duration (ms)": 0,
    "max_query_count": 0,
    "min_duration (ms)": 60000,
    "min_query_count": 270336,
    "mlperf_version": 0.7,
    "normalize_cores": 4,
    "normalize_processors": 4,
    "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code",
    "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R740_T4x4_TRT",
    "number_of_nodes": 1,
    "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)",
    "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.207, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10",
    "performance_issue_same": true,
    "performance_issue_same_index": 0,
    "performance_issue_unique": true,
    "performance_sample_count": 10833,
    "print_timestamps": true,
    "problem": false,
    "qsl_rng_seed": 12786827339337101903,
    "retraining": "N",
    "sample_index_rng_seed": 12640797754436136668,
    "samples_per_query": 1,
    "schedule_rng_seed": 3135815929913719677,
    "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx",
    "status": "available",
    "submitter": "DellEMC",
    "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC",
    "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)",
    "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_T4x4_TRT",
    "system_name": "Microsoft Corporation 7.0 (Virtual Machine)",
    "system_type": "datacenter",
    "target_latency (ns)": 130000000,
    "target_qps": 680,
    "task": "NLP",
    "task2": "nlp",
    "total_cores": 240,
    "uid": "505cc454d14a575b",
    "use_accelerator": true,
    "weight_data_types": "int8,fp16",
    "weight_transformations": "quantization, affine fusion"
  }
]