[ { "50.00 percentile latency (ns)": 439667024628, "90.00 percentile latency (ns)": 648298292867, "95.00 percentile latency (ns)": 666446479650, "97.00 percentile latency (ns)": 672388900342, "99.00 percentile latency (ns)": 677465162415, "99.90 percentile latency (ns)": 679205418372, "Max latency (ns)": 679273624423, "Mean latency (ns)": 413365931683, "Min duration satisfied": "Yes", "Min latency (ns)": 1624191686, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1690.63, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1690.63, "characteristics.samples_per_second.normalized_per_core": 1690.63, "characteristics.samples_per_second.normalized_per_processor": 1690.63, "ck_system": "A30x1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30x1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1148399, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x1_TRT_Triton", "system_name": "Gigabyte G482-Z54 (1x A30, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 1740, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "4dc2874d87ae432c", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 389364727683, "90.00 percentile latency (ns)": 573424687443, "95.00 percentile latency (ns)": 590715431656, "97.00 percentile latency (ns)": 596738795511, "99.00 percentile latency (ns)": 602015928685, "99.90 percentile latency (ns)": 604018970276, "Max latency (ns)": 604194879923, "Mean latency (ns)": 366665543997, "Min duration satisfied": "Yes", "Min latency (ns)": 406301230, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 48.064, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 14.236475165562917, "characteristics.power.normalized_per_core": 14.236475165562917, "characteristics.power.normalized_per_processor": 14.236475165562917, "characteristics.samples_per_second": 48.064, "characteristics.samples_per_second.normalized_per_core": 48.064, "characteristics.samples_per_second.normalized_per_processor": 48.064, "ck_system": "Xavier_NX_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2", "host_memory_capacity": "8 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline scenario", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/Xavier_NX_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04", "other_hardware": "", "other_software_stack": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2, cuDNN 8.2.3, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 29040, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT_MaxQ", "system_name": "Auvidea JNX30 Xavier NX (MaxQ, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 44, "task": "NLP", "task2": "nlp", "total_cores": 6, "uid": "5a09d4c30a688f3e", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 425870261176, "90.00 percentile latency (ns)": 634048846210, "95.00 percentile latency (ns)": 654125918890, "97.00 percentile latency (ns)": 661123477969, "99.00 percentile latency (ns)": 667147015950, "99.90 percentile latency (ns)": 669326860411, "Max latency (ns)": 669566995619, "Mean latency (ns)": 402652588335, "Min duration satisfied": "Yes", "Min latency (ns)": 233859813, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 95.614, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 95.614, "characteristics.samples_per_second.normalized_per_core": 95.614, "characteristics.samples_per_second.normalized_per_processor": 95.614, "ck_system": "AGX_Xavier_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2", "host_memory_capacity": "32 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline scenario", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04", "other_hardware": "", "other_software_stack": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2, cuDNN 8.2.3, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 64020, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT_Triton", "system_name": "NVIDIA Jetson AGX Xavier 32GB (TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 97, "task": "NLP", "task2": "nlp", "total_cores": 8, "uid": "3f9cadb49e2e9977", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 449602781082, "90.00 percentile latency (ns)": 665334842738, "95.00 percentile latency (ns)": 684238326042, "97.00 percentile latency (ns)": 690689686447, "99.00 percentile latency (ns)": 695947494267, "99.90 percentile latency (ns)": 697782538712, "Max latency (ns)": 697937628842, "Mean latency (ns)": 423029507005, "Min duration satisfied": "Yes", "Min latency (ns)": 1263304105, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 2836.93, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2836.93, "characteristics.samples_per_second.normalized_per_core": 2836.93, "characteristics.samples_per_second.normalized_per_processor": 2836.93, "ck_system": "A100-PCIex1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1980000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT_Triton", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3000, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "9a7e55ea4bcca489", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 423020818747, "90.00 percentile latency (ns)": 627438645441, "95.00 percentile latency (ns)": 646997189277, "97.00 percentile latency (ns)": 653762979285, "99.00 percentile latency (ns)": 659553104447, "99.90 percentile latency (ns)": 661643347173, "Max latency (ns)": 661870369807, "Mean latency (ns)": 399353252955, "Min duration satisfied": "Yes", "Min latency (ns)": 209809508, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 96.7259, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 96.7259, "characteristics.samples_per_second.normalized_per_core": 96.7259, "characteristics.samples_per_second.normalized_per_processor": 96.7259, "ck_system": "AGX_Xavier_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2", "host_memory_capacity": "32 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline scenario", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04", "other_hardware": "", "other_software_stack": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2, cuDNN 8.2.3, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 64020, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT", "system_name": "NVIDIA Jetson AGX Xavier 32GB (TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 97, "task": "NLP", "task2": "nlp", "total_cores": 8, "uid": "c7a4e55a0fe22252", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 423332036762, "90.00 percentile latency (ns)": 626489346901, "95.00 percentile latency (ns)": 644293666544, "97.00 percentile latency (ns)": 650290824950, "99.00 percentile latency (ns)": 655251452810, "99.90 percentile latency (ns)": 657074772974, "Max latency (ns)": 657225613752, "Mean latency (ns)": 398552692765, "Min duration satisfied": "Yes", "Min latency (ns)": 362163045, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 502.111, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB (1x1g.10gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 502.111, "characteristics.samples_per_second.normalized_per_core": 502.111, "characteristics.samples_per_second.normalized_per_processor": 502.111, "ck_system": "DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 330000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_Triton", "system_name": "NVIDIA DGX A100 (1x A100-SXM-80GB-MIG-1x1g.10gb, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 500, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "2b44eed8de4c7608", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 448967970337, "90.00 percentile latency (ns)": 661937709537, "95.00 percentile latency (ns)": 680137010416, "97.00 percentile latency (ns)": 686327387031, "99.00 percentile latency (ns)": 691357406178, "99.90 percentile latency (ns)": 693136866882, "Max latency (ns)": 693247125882, "Mean latency (ns)": 422147705409, "Min duration satisfied": "Yes", "Min latency (ns)": 2022533147, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 3236.94, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 3236.94, "characteristics.samples_per_second.normalized_per_core": 3236.94, "characteristics.samples_per_second.normalized_per_processor": 3236.94, "ck_system": "A100-PCIe-80GBx1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIe-80GBx1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 2244000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIe-80GBx1_TRT", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe-80GB, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3400, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "b58ca444e996ecfe", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 413215124648, "90.00 percentile latency (ns)": 609852381315, "95.00 percentile latency (ns)": 626992076592, "97.00 percentile latency (ns)": 632724807913, "99.00 percentile latency (ns)": 637397512382, "99.90 percentile latency (ns)": 639029148177, "Max latency (ns)": 639181569173, "Mean latency (ns)": 388540597266, "Min duration satisfied": "Yes", "Min latency (ns)": 1434385879, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 3614, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 3614, "characteristics.samples_per_second.normalized_per_core": 3614.0, "characteristics.samples_per_second.normalized_per_processor": 3614.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 2310000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT", "system_name": "NVIDIA DGX A100 (1x A100-SXM-80GB, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3500, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "7a8cff4cb78692fb", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 434839520829, "90.00 percentile latency (ns)": 643181212381, "95.00 percentile latency (ns)": 662907311978, "97.00 percentile latency (ns)": 669805140641, "99.00 percentile latency (ns)": 675826577690, "99.90 percentile latency (ns)": 678054302564, "Max latency (ns)": 678303635981, "Mean latency (ns)": 410033236208, "Min duration satisfied": "Yes", "Min latency (ns)": 335899656, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 60.327, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 60.327, "characteristics.samples_per_second.normalized_per_core": 60.327, "characteristics.samples_per_second.normalized_per_processor": 60.327, "ck_system": "Xavier_NX_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2", "host_memory_capacity": "8 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline scenario", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/Xavier_NX_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04", "other_hardware": "", "other_software_stack": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2, cuDNN 8.2.3, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 40920, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT_Triton", "system_name": "NVIDIA Jetson Xavier NX (TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 62, "task": "NLP", "task2": "nlp", "total_cores": 6, "uid": "d3888e2bf2794bca", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 427532890645, "90.00 percentile latency (ns)": 632503631039, "95.00 percentile latency (ns)": 650389492159, "97.00 percentile latency (ns)": 656358148610, "99.00 percentile latency (ns)": 661344563595, "99.90 percentile latency (ns)": 663095370303, "Max latency (ns)": 663241514131, "Mean latency (ns)": 402558541435, "Min duration satisfied": "Yes", "Min latency (ns)": 504879473, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 502.532, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30 (1x1g.6gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 502.532, "characteristics.samples_per_second.normalized_per_core": 502.532, "characteristics.samples_per_second.normalized_per_processor": 502.532, "ck_system": "A30-MIG_1x1g.6gb_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30-MIG_1x1g.6gb_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 333300, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30-MIG_1x1g.6gb_TRT", "system_name": "Gigabyte G482-Z54 (1x A30-MIG-1x1g.6gb, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 505, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "387e7cedc166bc4c", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 421023900613, "90.00 percentile latency (ns)": 622551867590, "95.00 percentile latency (ns)": 641591964388, "97.00 percentile latency (ns)": 648230718975, "99.00 percentile latency (ns)": 653907236887, "99.90 percentile latency (ns)": 655987022604, "Max latency (ns)": 656208156146, "Mean latency (ns)": 397176698709, "Min duration satisfied": "Yes", "Min latency (ns)": 325209874, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 61.3525, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 18.88824048706243, "characteristics.power.normalized_per_core": 18.88824048706243, "characteristics.power.normalized_per_processor": 18.88824048706243, "characteristics.samples_per_second": 61.3525, "characteristics.samples_per_second.normalized_per_core": 61.3525, "characteristics.samples_per_second.normalized_per_processor": 61.3525, "ck_system": "AGX_Xavier_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2", "host_memory_capacity": "32 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline scenario", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04", "other_hardware": "", "other_software_stack": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2, cuDNN 8.2.3, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 40260, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT_MaxQ", "system_name": "Auvidea X220-LC AGX Xavier 32GB (MaxQ, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 61, "task": "NLP", "task2": "nlp", "total_cores": 8, "uid": "65d64fae8a706d2b", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 392583238066, "90.00 percentile latency (ns)": 581245423872, "95.00 percentile latency (ns)": 597802715820, "97.00 percentile latency (ns)": 603466063073, "99.00 percentile latency (ns)": 608135544258, "99.90 percentile latency (ns)": 609764157299, "Max latency (ns)": 609901119031, "Mean latency (ns)": 369540236528, "Min duration satisfied": "Yes", "Min latency (ns)": 1172630878, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 3246.43, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 3246.43, "characteristics.samples_per_second.normalized_per_core": 3246.43, "characteristics.samples_per_second.normalized_per_processor": 3246.43, "ck_system": "A100-PCIe-80GBx1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIe-80GBx1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1980000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIe-80GBx1_TRT_Triton", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe-80GB, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3000, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "215c37b18f984b11", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 416872867335, "90.00 percentile latency (ns)": 616944022950, "95.00 percentile latency (ns)": 634777727368, "97.00 percentile latency (ns)": 640655850813, "99.00 percentile latency (ns)": 645527028638, "99.90 percentile latency (ns)": 647254692128, "Max latency (ns)": 647367563400, "Mean latency (ns)": 391843863836, "Min duration satisfied": "Yes", "Min latency (ns)": 680324258, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1121.46, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1121.46, "characteristics.samples_per_second.normalized_per_core": 1121.46, "characteristics.samples_per_second.normalized_per_processor": 1121.46, "ck_system": "A10x1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A10x1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 726000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x1_TRT_Triton", "system_name": "Supermicro 4029GP-TRT-OTO-28 (1x A10, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 1100, "task": "NLP", "task2": "nlp", "total_cores": 56, "uid": "a80c0d3ff4750a2c", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 413524353675, "90.00 percentile latency (ns)": 612201838259, "95.00 percentile latency (ns)": 629691636808, "97.00 percentile latency (ns)": 635582078783, "99.00 percentile latency (ns)": 640442380389, "99.90 percentile latency (ns)": 642173150228, "Max latency (ns)": 642335489282, "Mean latency (ns)": 389298669064, "Min duration satisfied": "Yes", "Min latency (ns)": 1367059440, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 3596.25, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 3596.25, "characteristics.samples_per_second.normalized_per_core": 3596.25, "characteristics.samples_per_second.normalized_per_processor": 3596.25, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 2310000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_Triton", "system_name": "NVIDIA DGX A100 (1x A100-SXM-80GB, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3500, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "38a544354f06e63f", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 421338722550, "90.00 percentile latency (ns)": 622987800681, "95.00 percentile latency (ns)": 640822058991, "97.00 percentile latency (ns)": 646658455737, "99.00 percentile latency (ns)": 651392312506, "99.90 percentile latency (ns)": 653088045869, "Max latency (ns)": 653195551058, "Mean latency (ns)": 396040135678, "Min duration satisfied": "Yes", "Min latency (ns)": 1175585457, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 1111.46, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1111.46, "characteristics.samples_per_second.normalized_per_core": 1111.46, "characteristics.samples_per_second.normalized_per_processor": 1111.46, "ck_system": "A10x1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A10x1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 726000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x1_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (1x A10, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 1100, "task": "NLP", "task2": "nlp", "total_cores": 56, "uid": "f8e8bafe8d48cea3", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 502688462760, "90.00 percentile latency (ns)": 739650488758, "95.00 percentile latency (ns)": 760307593322, "97.00 percentile latency (ns)": 767264383346, "99.00 percentile latency (ns)": 772870571915, "99.90 percentile latency (ns)": 774550640552, "Max latency (ns)": 774702954827, "Mean latency (ns)": 472871195910, "Min duration satisfied": "Yes", "Min latency (ns)": 3040214121, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 1680.02, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1680.02, "characteristics.samples_per_second.normalized_per_core": 1680.02, "characteristics.samples_per_second.normalized_per_processor": 1680.02, "ck_system": "A30x1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30x1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1301519, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x1_TRT", "system_name": "Gigabyte G482-Z54 (1x A30, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 1972, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "35bdae64e21892a2", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 424993847409, "90.00 percentile latency (ns)": 626708974348, "95.00 percentile latency (ns)": 644217495796, "97.00 percentile latency (ns)": 650154588492, "99.00 percentile latency (ns)": 654981710647, "99.90 percentile latency (ns)": 656725023405, "Max latency (ns)": 656860098026, "Mean latency (ns)": 399540402964, "Min duration satisfied": "Yes", "Min latency (ns)": 644300544, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 502.39, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB (1x1g.10gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 502.39, "characteristics.samples_per_second.normalized_per_core": 502.39, "characteristics.samples_per_second.normalized_per_processor": 502.39, "ck_system": "DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 330000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT", "system_name": "NVIDIA DGX A100 (1x A100-SXM-80GB-MIG-1x1g.10gb, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 500, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "354479787004c223", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 426776424304, "90.00 percentile latency (ns)": 630950693825, "95.00 percentile latency (ns)": 648781639335, "97.00 percentile latency (ns)": 654746497326, "99.00 percentile latency (ns)": 659718871739, "99.90 percentile latency (ns)": 661466049044, "Max latency (ns)": 661614193223, "Mean latency (ns)": 401654930319, "Min duration satisfied": "Yes", "Min latency (ns)": 500527441, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 503.768, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30 (1x1g.6gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 503.768, "characteristics.samples_per_second.normalized_per_core": 503.768, "characteristics.samples_per_second.normalized_per_processor": 503.768, "ck_system": "A30-MIG_1x1g.6gb_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30-MIG_1x1g.6gb_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 333300, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30-MIG_1x1g.6gb_TRT_Triton", "system_name": "Gigabyte G482-Z54 (1x A30-MIG-1x1g.6gb, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 505, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "2e8ab116392a1ff4", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 515728128858, "90.00 percentile latency (ns)": 760586876985, "95.00 percentile latency (ns)": 781667981655, "97.00 percentile latency (ns)": 788791889736, "99.00 percentile latency (ns)": 794560757767, "99.90 percentile latency (ns)": 796596716741, "Max latency (ns)": 796723659419, "Mean latency (ns)": 484809435456, "Min duration satisfied": "Yes", "Min latency (ns)": 2181611530, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 2816.53, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2816.53, "characteristics.samples_per_second.normalized_per_core": 2816.53, "characteristics.samples_per_second.normalized_per_processor": 2816.53, "ck_system": "A100-PCIex1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 2244000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3400, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "4fd5af84ee724085", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 429533807976, "90.00 percentile latency (ns)": 633286576987, "95.00 percentile latency (ns)": 652374648732, "97.00 percentile latency (ns)": 658979788228, "99.00 percentile latency (ns)": 664718845367, "99.90 percentile latency (ns)": 666832239453, "Max latency (ns)": 667067102182, "Mean latency (ns)": 404713334130, "Min duration satisfied": "Yes", "Min latency (ns)": 314212687, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 61.3432, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 61.3432, "characteristics.samples_per_second.normalized_per_core": 61.3432, "characteristics.samples_per_second.normalized_per_processor": 61.3432, "ck_system": "Xavier_NX_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2", "host_memory_capacity": "8 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline scenario", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/Xavier_NX_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04", "other_hardware": "", "other_software_stack": "JetPack 4.6, TensorRT 8.0.1, CUDA 10.2, cuDNN 8.2.3, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 40920, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT", "system_name": "NVIDIA Jetson Xavier NX (TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 62, "task": "NLP", "task2": "nlp", "total_cores": 6, "uid": "64808b3f248abb33", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 508531005400, "90.00 percentile latency (ns)": 749477546180, "95.00 percentile latency (ns)": 770679185705, "97.00 percentile latency (ns)": 777599670507, "99.00 percentile latency (ns)": 783477565227, "99.90 percentile latency (ns)": 785434556526, "Max latency (ns)": 785594512699, "Mean latency (ns)": 477961460814, "Min duration satisfied": "Yes", "Min latency (ns)": 1681485980, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1656.73, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1656.73, "characteristics.samples_per_second.normalized_per_core": 1656.73, "characteristics.samples_per_second.normalized_per_processor": 1656.73, "ck_system": "A30x1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A30x1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.46, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1301519, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x1_TRT_Triton", "system_name": "Gigabyte G482-Z54 (1x A30, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 1972, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "ccb666417acf48b0", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 434165070118, "90.00 percentile latency (ns)": 655703910047, "95.00 percentile latency (ns)": 678619624977, "97.00 percentile latency (ns)": 687506228566, "99.00 percentile latency (ns)": 695064807667, "99.90 percentile latency (ns)": 697743569823, "Max latency (ns)": 698060595522, "Mean latency (ns)": 413714727203, "Min duration satisfied": "Yes", "Min latency (ns)": 144326750, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 41.601, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 13.709243553008596, "characteristics.power.normalized_per_core": 13.709243553008596, "characteristics.power.normalized_per_processor": 13.709243553008596, "characteristics.samples_per_second": 41.601, "characteristics.samples_per_second.normalized_per_core": 41.601, "characteristics.samples_per_second.normalized_per_processor": 41.601, "ck_system": "Xavier_NX_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2", "host_memory_capacity": "8 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/Xavier_NX_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2, cuDNN 8.0.0, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 29040, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT_MaxQ", "system_name": "NVIDIA Jetson Xavier NX (MaxQ, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 44, "task": "NLP", "task2": "nlp", "total_cores": 6, "uid": "447496c03233ddbd", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 497620915340, "90.00 percentile latency (ns)": 735936523544, "95.00 percentile latency (ns)": 756835798090, "97.00 percentile latency (ns)": 763898537499, "99.00 percentile latency (ns)": 769688042913, "99.90 percentile latency (ns)": 771766033936, "Max latency (ns)": 771896633569, "Mean latency (ns)": 468116605104, "Min duration satisfied": "Yes", "Min latency (ns)": 1314752548, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 2907.12, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2907.12, "characteristics.samples_per_second.normalized_per_core": 2907.12, "characteristics.samples_per_second.normalized_per_processor": 2907.12, "ck_system": "A100-PCIex1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 2244000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT_Triton", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3400, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "df84837a2106957c", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 390706236762, "90.00 percentile latency (ns)": 584992958534, "95.00 percentile latency (ns)": 603912871602, "97.00 percentile latency (ns)": 610732022733, "99.00 percentile latency (ns)": 616611928112, "99.90 percentile latency (ns)": 618708009142, "Max latency (ns)": 618960854899, "Mean latency (ns)": 370464167332, "Min duration satisfied": "Yes", "Min latency (ns)": 121766195, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 91.7021, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 91.7021, "characteristics.samples_per_second.normalized_per_core": 91.7021, "characteristics.samples_per_second.normalized_per_processor": 91.7021, "ck_system": "AGX_Xavier_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2", "host_memory_capacity": "32 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2, cuDNN 8.0.0, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 56760, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT", "system_name": "NVIDIA Jetson AGX Xavier 32GB (TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 86, "task": "NLP", "task2": "nlp", "total_cores": 8, "uid": "1bd946c47bba043b", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 414751262194, "90.00 percentile latency (ns)": 611928728581, "95.00 percentile latency (ns)": 629088349076, "97.00 percentile latency (ns)": 634830679036, "99.00 percentile latency (ns)": 639522200444, "99.90 percentile latency (ns)": 641164974875, "Max latency (ns)": 641317434545, "Mean latency (ns)": 389907256763, "Min duration satisfied": "Yes", "Min latency (ns)": 1476888473, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 3601.96, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 3601.96, "characteristics.samples_per_second.normalized_per_core": 3601.96, "characteristics.samples_per_second.normalized_per_processor": 3601.96, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_edge", "ck_used": true, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_edge", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 2310000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_edge", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3500, "task": "NLP", "task2": "nlp", "total_cores": 240, "uid": "9255d3e5cca423d9", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 423556811432, "90.00 percentile latency (ns)": 626977997481, "95.00 percentile latency (ns)": 644809467121, "97.00 percentile latency (ns)": 650817850099, "99.00 percentile latency (ns)": 655792663867, "99.90 percentile latency (ns)": 657623549775, "Max latency (ns)": 657775386741, "Mean latency (ns)": 398783693939, "Min duration satisfied": "Yes", "Min latency (ns)": 361677811, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 501.691, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB (1x1g.10gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 501.691, "characteristics.samples_per_second.normalized_per_core": 501.691, "characteristics.samples_per_second.normalized_per_processor": 501.691, "ck_system": "DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 330000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_Triton", "system_name": "NVIDIA DGX-A100 (1x A100-SXM-80GB-MIG-1x1g.10gb, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 500, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "24015abd05bcf3dc", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 527360837753, "90.00 percentile latency (ns)": 784281081539, "95.00 percentile latency (ns)": 809058247328, "97.00 percentile latency (ns)": 817963067778, "99.00 percentile latency (ns)": 825604040483, "99.90 percentile latency (ns)": 828326733966, "Max latency (ns)": 828667672200, "Mean latency (ns)": 498872032131, "Min duration satisfied": "Yes", "Min latency (ns)": 221252509, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 48.584, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 17.497624396135258, "characteristics.power.normalized_per_core": 17.497624396135258, "characteristics.power.normalized_per_processor": 17.497624396135258, "characteristics.samples_per_second": 48.584, "characteristics.samples_per_second.normalized_per_core": 48.584, "characteristics.samples_per_second.normalized_per_processor": 48.584, "ck_system": "AGX_Xavier_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2", "host_memory_capacity": "32 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2, cuDNN 8.0.0, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 40260, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT_MaxQ", "system_name": "NVIDIA Jetson AGX Xavier 32GB (MaxQ, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 61, "task": "NLP", "task2": "nlp", "total_cores": 8, "uid": "4f7899544f12ac4d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 413509668762, "90.00 percentile latency (ns)": 611495732474, "95.00 percentile latency (ns)": 629098719139, "97.00 percentile latency (ns)": 634900052741, "99.00 percentile latency (ns)": 639736093749, "99.90 percentile latency (ns)": 641375439375, "Max latency (ns)": 641500487571, "Mean latency (ns)": 388840145561, "Min duration satisfied": "Yes", "Min latency (ns)": 678810014, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 339.516, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30 (1x1g.3gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 339.516, "characteristics.samples_per_second.normalized_per_core": 339.516, "characteristics.samples_per_second.normalized_per_processor": 339.516, "ck_system": "A30-MIG_1x1g.3gb_TRT", "ck_used": true, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A30-MIG_1x1g.3gb_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.46, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 217800, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30-MIG_1x1g.3gb_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 330, "task": "NLP", "task2": "nlp", "total_cores": 240, "uid": "b29890142440ab88", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 393464704286, "90.00 percentile latency (ns)": 582608712209, "95.00 percentile latency (ns)": 599460033806, "97.00 percentile latency (ns)": 605011318229, "99.00 percentile latency (ns)": 609614227624, "99.90 percentile latency (ns)": 611200908340, "Max latency (ns)": 611331197871, "Mean latency (ns)": 369669684873, "Min duration satisfied": "Yes", "Min latency (ns)": 670955110, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1079.61, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1079.61, "characteristics.samples_per_second.normalized_per_core": 1079.61, "characteristics.samples_per_second.normalized_per_processor": 1079.61, "ck_system": "A10x1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A10x1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 660000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x1_TRT_Triton", "system_name": "Supermicro 4029GP-TRT-OTO-28 (1x A10, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 1000, "task": "NLP", "task2": "nlp", "total_cores": 56, "uid": "e393d871f54ad9bd", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 415824802546, "90.00 percentile latency (ns)": 615935450245, "95.00 percentile latency (ns)": 633586322910, "97.00 percentile latency (ns)": 639534585887, "99.00 percentile latency (ns)": 644462548239, "99.90 percentile latency (ns)": 646224858882, "Max latency (ns)": 646388856610, "Mean latency (ns)": 391507599076, "Min duration satisfied": "Yes", "Min latency (ns)": 1388996157, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 3573.7, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 3573.7, "characteristics.samples_per_second.normalized_per_core": 3573.7, "characteristics.samples_per_second.normalized_per_processor": 3573.7, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "ck_used": true, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 2310000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3500, "task": "NLP", "task2": "nlp", "total_cores": 240, "uid": "487f9619eac57751", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 398045840475, "90.00 percentile latency (ns)": 588926812107, "95.00 percentile latency (ns)": 605742578446, "97.00 percentile latency (ns)": 611383200747, "99.00 percentile latency (ns)": 615923970366, "99.90 percentile latency (ns)": 617420151827, "Max latency (ns)": 617544904262, "Mean latency (ns)": 373945726932, "Min duration satisfied": "Yes", "Min latency (ns)": 1195004934, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 1068.75, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1068.75, "characteristics.samples_per_second.normalized_per_core": 1068.75, "characteristics.samples_per_second.normalized_per_processor": 1068.75, "ck_system": "A10x1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A10x1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 660000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x1_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (1x A10, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 1000, "task": "NLP", "task2": "nlp", "total_cores": 56, "uid": "f34db4125a5b25c3", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 511894414317, "90.00 percentile latency (ns)": 753097865419, "95.00 percentile latency (ns)": 774147884771, "97.00 percentile latency (ns)": 781249078262, "99.00 percentile latency (ns)": 786968762570, "99.90 percentile latency (ns)": 788683807219, "Max latency (ns)": 788839071941, "Mean latency (ns)": 481492556459, "Min duration satisfied": "Yes", "Min latency (ns)": 3090183112, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 1649.92, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1649.92, "characteristics.samples_per_second.normalized_per_core": 1649.92, "characteristics.samples_per_second.normalized_per_processor": 1649.92, "ck_system": "A30x1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A30x1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.46, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1301519, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x1_TRT", "system_name": "Gigabyte G482-Z54 (1x A30, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 1972, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "9c3d4d344a29add3", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 413167737756, "90.00 percentile latency (ns)": 610920139073, "95.00 percentile latency (ns)": 628496679767, "97.00 percentile latency (ns)": 634285982762, "99.00 percentile latency (ns)": 639109779898, "99.90 percentile latency (ns)": 640743350146, "Max latency (ns)": 640868189696, "Mean latency (ns)": 388506421634, "Min duration satisfied": "Yes", "Min latency (ns)": 673287664, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 339.851, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30 (1x1g.3gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 339.851, "characteristics.samples_per_second.normalized_per_core": 339.851, "characteristics.samples_per_second.normalized_per_processor": 339.851, "ck_system": "A30-MIG_1x1g.3gb_TRT_Triton", "ck_used": true, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A30-MIG_1x1g.3gb_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.46, DALI 0.30.0, Triton 21.02; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 217800, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30-MIG_1x1g.3gb_TRT_Triton", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 330, "task": "NLP", "task2": "nlp", "total_cores": 240, "uid": "b459b8044a286fcb", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 424752955914, "90.00 percentile latency (ns)": 626582429329, "95.00 percentile latency (ns)": 644094456135, "97.00 percentile latency (ns)": 649970380938, "99.00 percentile latency (ns)": 654854441621, "99.90 percentile latency (ns)": 656597969779, "Max latency (ns)": 656734847301, "Mean latency (ns)": 399305877935, "Min duration satisfied": "Yes", "Min latency (ns)": 642161574, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 502.486, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB (1x1g.10gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 502.486, "characteristics.samples_per_second.normalized_per_core": 502.486, "characteristics.samples_per_second.normalized_per_processor": 502.486, "ck_system": "DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 330000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT", "system_name": "NVIDIA DGX-A100 (1x A100-SXM-80GB-MIG-1x1g.10gb, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 500, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "288a4cfca804032f", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 504879933938, "90.00 percentile latency (ns)": 743948928449, "95.00 percentile latency (ns)": 764349710979, "97.00 percentile latency (ns)": 771273250930, "99.00 percentile latency (ns)": 776897059302, "99.90 percentile latency (ns)": 778888881269, "Max latency (ns)": 779011285290, "Mean latency (ns)": 474687122372, "Min duration satisfied": "Yes", "Min latency (ns)": 2199107166, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 2880.57, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2880.57, "characteristics.samples_per_second.normalized_per_core": 2880.57, "characteristics.samples_per_second.normalized_per_processor": 2880.57, "ck_system": "A100-PCIex1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 2244000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3400, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "8d84f8bb1c3c05ff", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 417183040449, "90.00 percentile latency (ns)": 631412309269, "95.00 percentile latency (ns)": 653854221928, "97.00 percentile latency (ns)": 662511426807, "99.00 percentile latency (ns)": 669904365978, "99.90 percentile latency (ns)": 672566650638, "Max latency (ns)": 672884711347, "Mean latency (ns)": 397665246706, "Min duration satisfied": "Yes", "Min latency (ns)": 98495662, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 51.0043, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 51.0043, "characteristics.samples_per_second.normalized_per_core": 51.0043, "characteristics.samples_per_second.normalized_per_processor": 51.0043, "ck_system": "Xavier_NX_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2", "host_memory_capacity": "8 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "bert-99", "input_data_types": "int32", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/Xavier_NX_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2, cuDNN 8.0.0, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 10833, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 34320, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT", "system_name": "NVIDIA Jetson Xavier NX (TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 52, "task": "NLP", "task2": "nlp", "total_cores": 6, "uid": "a2a841b8990ad53d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 43503819267, "90.00 percentile latency (ns)": 64292065785, "95.00 percentile latency (ns)": 66237835906, "97.00 percentile latency (ns)": 66764994622, "99.00 percentile latency (ns)": 67186363017, "99.90 percentile latency (ns)": 67282083176, "Max latency (ns)": 67282197700, "Mean latency (ns)": 41098872137, "Min duration satisfied": "Yes", "Min latency (ns)": 1383937208, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 3335.21, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 3335.21, "characteristics.samples_per_second.normalized_per_core": 3335.21, "characteristics.samples_per_second.normalized_per_processor": 3335.21, "ck_system": "DGX-A100_A100-SXM4x1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 224400, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x1_TRT", "system_name": "NVIDIA DGX-A100 (1x A100-SXM4, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3400, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "bd0a4c668c41c882", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 232205893251, "90.00 percentile latency (ns)": 418222099394, "95.00 percentile latency (ns)": 441176632555, "97.00 percentile latency (ns)": 450625476239, "99.00 percentile latency (ns)": 460047236034, "99.90 percentile latency (ns)": 464095490906, "Max latency (ns)": 464349860204, "Mean latency (ns)": 232320912672, "Min duration satisfied": "Yes", "Min latency (ns)": 305782507, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 434.931, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "5GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4 (1x1g.5gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 434.931, "characteristics.samples_per_second.normalized_per_core": 434.931, "characteristics.samples_per_second.normalized_per_processor": 434.931, "ck_system": "DGX-A100_A100-SXM4x1-MIG_1x1g.5gb_TRT_Triton", "ck_used": true, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x1-MIG_1x1g.5gb_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 201960, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x1-MIG_1x1g.5gb_TRT_Triton", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3060, "task": "NLP", "task2": "nlp", "total_cores": 240, "uid": "0240ba85035afa20", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 36910436469, "90.00 percentile latency (ns)": 66610593462, "95.00 percentile latency (ns)": 70374547923, "97.00 percentile latency (ns)": 71883280488, "99.00 percentile latency (ns)": 73393680944, "99.90 percentile latency (ns)": 73854286533, "Max latency (ns)": 73854610987, "Mean latency (ns)": 37004980523, "Min duration satisfied": "Yes", "Min latency (ns)": 387590524, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 2734.56, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 2734.56, "characteristics.samples_per_second.normalized_per_core": 2734.56, "characteristics.samples_per_second.normalized_per_processor": 2734.56, "ck_system": "A100-PCIex1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 201960, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT_Triton", "system_name": "Gigabyte G482-Z52 (1x A100-PCIe, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3060, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "a4c86a6fec5f6bd6", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 170332592956, "90.00 percentile latency (ns)": 254609559310, "95.00 percentile latency (ns)": 263052964455, "97.00 percentile latency (ns)": 266183883739, "99.00 percentile latency (ns)": 268897504657, "99.90 percentile latency (ns)": 269900172108, "Max latency (ns)": 270019605232, "Mean latency (ns)": 161511738387, "Min duration satisfied": "Yes", "Min latency (ns)": 104278522, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 91.0156, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 91.0156, "characteristics.samples_per_second.normalized_per_core": 91.0156, "characteristics.samples_per_second.normalized_per_processor": 91.0156, "ck_system": "AGX_Xavier_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2", "host_memory_capacity": "32GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 24576, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT", "system_name": "NVIDIA Jetson AGX Xavier 32GB (TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 86, "task": "NLP", "task2": "nlp", "total_cores": 8, "uid": "9b6b458b9c3c8b73", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 31630866926, "90.00 percentile latency (ns)": 56976153316, "95.00 percentile latency (ns)": 60177934970, "97.00 percentile latency (ns)": 61458484826, "99.00 percentile latency (ns)": 62741663785, "99.90 percentile latency (ns)": 63135508106, "Max latency (ns)": 63135678626, "Mean latency (ns)": 31682659112, "Min duration satisfied": "Yes", "Min latency (ns)": 325631937, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 3198.83, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 3198.83, "characteristics.samples_per_second.normalized_per_core": 3198.83, "characteristics.samples_per_second.normalized_per_processor": 3198.83, "ck_system": "DGX-A100_A100-SXM4x1_TRT_Triton", "ck_used": true, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 201960, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x1_TRT_Triton", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3060, "task": "NLP", "task2": "nlp", "total_cores": 240, "uid": "a8ac4494d8167287", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 312453689161, "90.00 percentile latency (ns)": 460655962474, "95.00 percentile latency (ns)": 473816699049, "97.00 percentile latency (ns)": 478245107496, "99.00 percentile latency (ns)": 481917889201, "99.90 percentile latency (ns)": 483251899613, "Max latency (ns)": 483332865320, "Mean latency (ns)": 293857806546, "Min duration satisfied": "Yes", "Min latency (ns)": 1268029380, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 464.276, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "5GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4 (1x1g.5gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 464.276, "characteristics.samples_per_second.normalized_per_core": 464.276, "characteristics.samples_per_second.normalized_per_processor": 464.276, "ck_system": "DGX-A100_A100-SXM4x1-MIG_1x1g.5gb_TRT", "ck_used": true, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x1-MIG_1x1g.5gb_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 224400, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x1-MIG_1x1g.5gb_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3400, "task": "NLP", "task2": "nlp", "total_cores": 240, "uid": "638405bb7744c6bf", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 33737986752, "90.00 percentile latency (ns)": 60362562323, "95.00 percentile latency (ns)": 63988136142, "97.00 percentile latency (ns)": 65189155708, "99.00 percentile latency (ns)": 66389932451, "99.90 percentile latency (ns)": 66909980648, "Max latency (ns)": 66910033523, "Mean latency (ns)": 33690194757, "Min duration satisfied": "Yes", "Min latency (ns)": 625405991, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 424.152, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 424.152, "characteristics.samples_per_second.normalized_per_core": 424.152, "characteristics.samples_per_second.normalized_per_processor": 424.152, "ck_system": "T4x1_TRT_Triton", "ck_used": true, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 28380, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x1_TRT_Triton", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 430, "task": "NLP", "task2": "nlp", "total_cores": 240, "uid": "112ac2fed11e27e4", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 41627813106, "90.00 percentile latency (ns)": 61110401024, "95.00 percentile latency (ns)": 62992838348, "97.00 percentile latency (ns)": 63502923358, "99.00 percentile latency (ns)": 63903634943, "99.90 percentile latency (ns)": 64042184039, "Max latency (ns)": 64042201036, "Mean latency (ns)": 39396413384, "Min duration satisfied": "Yes", "Min latency (ns)": 2489052999, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 443.145, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 443.145, "characteristics.samples_per_second.normalized_per_core": 443.145, "characteristics.samples_per_second.normalized_per_processor": 443.145, "ck_system": "T4x1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 28380, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x1_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (1x T4, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 430, "task": "NLP", "task2": "nlp", "total_cores": 56, "uid": "07406e944b2bd196", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 50690177284, "90.00 percentile latency (ns)": 74980357905, "95.00 percentile latency (ns)": 77262908057, "97.00 percentile latency (ns)": 77881366615, "99.00 percentile latency (ns)": 78372527355, "99.90 percentile latency (ns)": 78484358347, "Max latency (ns)": 78484435855, "Mean latency (ns)": 47905787957, "Min duration satisfied": "Yes", "Min latency (ns)": 1618423758, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 2859.17, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 2859.17, "characteristics.samples_per_second.normalized_per_core": 2859.17, "characteristics.samples_per_second.normalized_per_processor": 2859.17, "ck_system": "A100-PCIex1_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 224400, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT", "system_name": "Gigabyte G482-Z52 (1x A100-PCIe, TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 3400, "task": "NLP", "task2": "nlp", "total_cores": 128, "uid": "42ab04852af8cacf", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 301392855188, "90.00 percentile latency (ns)": 455386662145, "95.00 percentile latency (ns)": 471660730549, "97.00 percentile latency (ns)": 477980669072, "99.00 percentile latency (ns)": 483373618581, "99.90 percentile latency (ns)": 485318751208, "Max latency (ns)": 485545745082, "Mean latency (ns)": 287131715756, "Min duration satisfied": "Yes", "Min latency (ns)": 94078790, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "BERT SERVER", "Samples per second": 50.6152, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 50.6152, "characteristics.samples_per_second.normalized_per_core": 50.6152, "characteristics.samples_per_second.normalized_per_processor": 50.6152, "ck_system": "Xavier_NX_TRT", "ck_used": false, "cooling": "", "dataset": "SQuAD v1.1", "dataset_link": "", "dim_x_default": "seq_number", "dim_x_maximize": true, "dim_y_default": "characteristics.samples_per_second", "dim_y_maximize": false, "division": "closed", "formal_model": "bert", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2", "host_memory_capacity": "8GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "bert-99", "input_data_types": "int32", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/Xavier_NX_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 10833, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 24576, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "bert_large_v1_1_fake_quant.onnx", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT", "system_name": "NVIDIA Jetson Xavier NX (TensorRT)", "system_type": "edge", "target_latency (ns)": 0, "target_qps": 45, "task": "NLP", "task2": "nlp", "total_cores": 6, "uid": "8e478d836588b6ba", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" } ]