[ { "50.00 percentile": 1, "50.00 percentile latency (ns)": 23002332, "90.00 percentile": 1, "90.00 percentile latency (ns)": 39713566, "95.00 percentile": 1, "95.00 percentile latency (ns)": 41794730, "97.00 percentile": 1, "97.00 percentile latency (ns)": 42174931, "99.00 percentile": 1, "99.00 percentile latency (ns)": 42683398, "99.90 percentile": 2, "99.90 percentile latency (ns)": 44112876, "Max latency (ns)": 86309512, "Mean latency (ns)": 23089114, "Min duration satisfied": "Yes", "Min latency (ns)": 3392879, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 1520, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 22.912, "characteristics.samples_per_query": 1520, "characteristics.samples_per_query.normalized_per_core": 1520.0, "characteristics.samples_per_query.normalized_per_processor": 1520.0, "ck_system": "A100-PCIex1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 1024, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1520, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT_Triton", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "a772c14802bd369d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 42608150, "90.00 percentile": 1, "90.00 percentile latency (ns)": 62867317, "95.00 percentile": 1, "95.00 percentile latency (ns)": 63191721, "97.00 percentile": 1, "97.00 percentile latency (ns)": 63358358, "99.00 percentile": 1, "99.00 percentile latency (ns)": 63670658, "99.90 percentile": 1, "99.90 percentile latency (ns)": 64151151, "Max latency (ns)": 94585698, "Mean latency (ns)": 40135244, "Min duration satisfied": "Yes", "Min latency (ns)": 15602061, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 48, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 20.111, "characteristics.samples_per_query": 48, "characteristics.samples_per_query.normalized_per_core": 48.0, "characteristics.samples_per_query.normalized_per_processor": 48.0, "ck_system": "A100-PCIex1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 64, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 48, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT_Triton", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "8405f804ac0faca1", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 37602297, "90.00 percentile": 1, "90.00 percentile latency (ns)": 43288308, "95.00 percentile": 1, "95.00 percentile latency (ns)": 43414079, "97.00 percentile": 1, "97.00 percentile latency (ns)": 43495187, "99.00 percentile": 1, "99.00 percentile latency (ns)": 43633812, "99.90 percentile": 1, "99.90 percentile latency (ns)": 43830768, "Max latency (ns)": 47838623, "Mean latency (ns)": 38880376, "Min duration satisfied": "Yes", "Min latency (ns)": 36111029, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 107, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 22.891, "characteristics.samples_per_query": 107, "characteristics.samples_per_query.normalized_per_core": 107.0, "characteristics.samples_per_query.normalized_per_processor": 107.0, "ck_system": "AGX_Xavier_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2", "host_memory_capacity": "32 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2, cuDNN 8.0.0, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 1024, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 107, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT", "system_name": "NVIDIA Jetson AGX Xavier 32GB (TensorRT)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 8, "uid": "2cc74c61d4a5592b", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 50939682, "90.00 percentile": 1, "90.00 percentile latency (ns)": 51098636, "95.00 percentile": 1, "95.00 percentile latency (ns)": 51163330, "97.00 percentile": 1, "97.00 percentile latency (ns)": 51209200, "99.00 percentile": 1, "99.00 percentile latency (ns)": 51296071, "99.90 percentile": 1, "99.90 percentile latency (ns)": 51453596, "Max latency (ns)": 52999555, "Mean latency (ns)": 50943927, "Min duration satisfied": "Yes", "Min latency (ns)": 50599644, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 2, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 20.111, "characteristics.samples_per_query": 2, "characteristics.samples_per_query.normalized_per_core": 2.0, "characteristics.samples_per_query.normalized_per_processor": 2.0, "ck_system": "AGX_Xavier_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2", "host_memory_capacity": "32 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2, cuDNN 8.0.0, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 64, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 2, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT", "system_name": "NVIDIA Jetson AGX Xavier 32GB (TensorRT)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 8, "uid": "716ca6a4241a59fd", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 45868259, "90.00 percentile": 1, "90.00 percentile latency (ns)": 46337186, "95.00 percentile": 1, "95.00 percentile latency (ns)": 46456371, "97.00 percentile": 1, "97.00 percentile latency (ns)": 46545881, "99.00 percentile": 1, "99.00 percentile latency (ns)": 46685942, "99.90 percentile": 2, "99.90 percentile latency (ns)": 53653005, "Max latency (ns)": 128075241, "Mean latency (ns)": 45882650, "Min duration satisfied": "Yes", "Min latency (ns)": 42346138, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 2368, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 22.911, "characteristics.samples_per_query": 2368, "characteristics.samples_per_query.normalized_per_core": 2368.0, "characteristics.samples_per_query.normalized_per_processor": 2368.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_edge", "ck_used": true, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_edge", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 1024, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 2368, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_edge", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 240, "uid": "45345e0afacab341", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 61572140, "90.00 percentile": 1, "90.00 percentile latency (ns)": 61900227, "95.00 percentile": 1, "95.00 percentile latency (ns)": 62049649, "97.00 percentile": 1, "97.00 percentile latency (ns)": 62092277, "99.00 percentile": 1, "99.00 percentile latency (ns)": 62250139, "99.90 percentile": 1, "99.90 percentile latency (ns)": 63137672, "Max latency (ns)": 159012397, "Mean latency (ns)": 61605764, "Min duration satisfied": "Yes", "Min latency (ns)": 59560969, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 60, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 20.111, "characteristics.samples_per_query": 60, "characteristics.samples_per_query.normalized_per_core": 60.0, "characteristics.samples_per_query.normalized_per_processor": 60.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_edge", "ck_used": true, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_edge", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 64, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 60, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_edge", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 240, "uid": "a96a7f2ffdded4f5", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 30047019, "90.00 percentile": 1, "90.00 percentile latency (ns)": 47645781, "95.00 percentile": 1, "95.00 percentile latency (ns)": 48005457, "97.00 percentile": 1, "97.00 percentile latency (ns)": 48169337, "99.00 percentile": 1, "99.00 percentile latency (ns)": 48458293, "99.90 percentile": 2, "99.90 percentile latency (ns)": 49736911, "Max latency (ns)": 227875008, "Mean latency (ns)": 30024918, "Min duration satisfied": "Yes", "Min latency (ns)": 10826627, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 1920, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 22.912, "characteristics.samples_per_query": 1920, "characteristics.samples_per_query.normalized_per_core": 1920.0, "characteristics.samples_per_query.normalized_per_processor": 1920.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "ck_used": true, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 1024, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1920, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 240, "uid": "5ab42c5a6e3b9510", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 44034976, "90.00 percentile": 1, "90.00 percentile latency (ns)": 59476205, "95.00 percentile": 1, "95.00 percentile latency (ns)": 59580477, "97.00 percentile": 1, "97.00 percentile latency (ns)": 59660664, "99.00 percentile": 1, "99.00 percentile latency (ns)": 59775264, "99.90 percentile": 1, "99.90 percentile latency (ns)": 60054833, "Max latency (ns)": 274456275, "Mean latency (ns)": 37376471, "Min duration satisfied": "Yes", "Min latency (ns)": 14833111, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 56, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 20.111, "characteristics.samples_per_query": 56, "characteristics.samples_per_query.normalized_per_core": 56.0, "characteristics.samples_per_query.normalized_per_processor": 56.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "ck_used": true, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 64, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 56, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_Triton_edge", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 240, "uid": "ec63324db4b05022", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 23415442, "90.00 percentile": 1, "90.00 percentile latency (ns)": 41560788, "95.00 percentile": 1, "95.00 percentile latency (ns)": 44532231, "97.00 percentile": 1, "97.00 percentile latency (ns)": 44803627, "99.00 percentile": 1, "99.00 percentile latency (ns)": 45108888, "99.90 percentile": 2, "99.90 percentile latency (ns)": 46393285, "Max latency (ns)": 99510025, "Mean latency (ns)": 24576918, "Min duration satisfied": "Yes", "Min latency (ns)": 3886327, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 1540, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 22.912, "characteristics.samples_per_query": 1540, "characteristics.samples_per_query.normalized_per_core": 1540.0, "characteristics.samples_per_query.normalized_per_processor": 1540.0, "ck_system": "A100-PCIex1_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 1024, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1540, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe, TensorRT)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "c2c8894bcf807b77", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 52795106, "90.00 percentile": 1, "90.00 percentile latency (ns)": 62239930, "95.00 percentile": 1, "95.00 percentile latency (ns)": 62460249, "97.00 percentile": 1, "97.00 percentile latency (ns)": 62626720, "99.00 percentile": 1, "99.00 percentile latency (ns)": 62838226, "99.90 percentile": 2, "99.90 percentile latency (ns)": 64614516, "Max latency (ns)": 82113326, "Mean latency (ns)": 46301860, "Min duration satisfied": "Yes", "Min latency (ns)": 27052185, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 48, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 20.111, "characteristics.samples_per_query": 48, "characteristics.samples_per_query.normalized_per_core": 48.0, "characteristics.samples_per_query.normalized_per_processor": 48.0, "ck_system": "A100-PCIex1_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 64, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 48, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT", "system_name": "Gigabyte G482-Z54 (1x A100-PCIe, TensorRT)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "343b4b3a7c4feb92", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 37154863, "90.00 percentile": 1, "90.00 percentile latency (ns)": 44709194, "95.00 percentile": 1, "95.00 percentile latency (ns)": 44778897, "97.00 percentile": 1, "97.00 percentile latency (ns)": 44822072, "99.00 percentile": 1, "99.00 percentile latency (ns)": 44905064, "99.90 percentile": 1, "99.90 percentile latency (ns)": 45632447, "Max latency (ns)": 75245419, "Mean latency (ns)": 39540371, "Min duration satisfied": "Yes", "Min latency (ns)": 35839762, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 60, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 22.933, "characteristics.samples_per_query": 60, "characteristics.samples_per_query.normalized_per_core": 60.0, "characteristics.samples_per_query.normalized_per_processor": 60.0, "ck_system": "Xavier_NX_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2", "host_memory_capacity": "8 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/Xavier_NX_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2, cuDNN 8.0.0, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 1024, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 60, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT", "system_name": "NVIDIA Jetson Xavier NX (TensorRT)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 6, "uid": "a03285746d0b3f0d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 43082231, "90.00 percentile": 1, "90.00 percentile latency (ns)": 43146610, "95.00 percentile": 1, "95.00 percentile latency (ns)": 43167245, "97.00 percentile": 1, "97.00 percentile latency (ns)": 43182517, "99.00 percentile": 1, "99.00 percentile latency (ns)": 43224107, "99.90 percentile": 1, "99.90 percentile latency (ns)": 43770863, "Max latency (ns)": 79365119, "Mean latency (ns)": 43087967, "Min duration satisfied": "Yes", "Min latency (ns)": 42896614, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 1, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.mAP": 20.111, "characteristics.samples_per_query": 1, "characteristics.samples_per_query.normalized_per_core": 1.0, "characteristics.samples_per_query.normalized_per_processor": 1.0, "ck_system": "Xavier_NX_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2", "host_memory_capacity": "8 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32 GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 270336, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/Xavier_NX_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "21.03 Jetson CUDA-X AI Developer Preview, TensorRT 7.2.3, CUDA 10.2, cuDNN 8.0.0, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 64, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT", "system_name": "NVIDIA Jetson Xavier NX (TensorRT)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 6, "uid": "f92e000d1b14e959", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 44920523, "90.00 percentile": 1, "90.00 percentile latency (ns)": 45982449, "95.00 percentile": 1, "95.00 percentile latency (ns)": 46339383, "97.00 percentile": 1, "97.00 percentile latency (ns)": 46517355, "99.00 percentile": 1, "99.00 percentile latency (ns)": 47373960, "99.90 percentile": 2, "99.90 percentile latency (ns)": 59950208, "Max latency (ns)": 144348795, "Mean latency (ns)": 45188631, "Min duration satisfied": "Yes", "Min latency (ns)": 44180511, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 2368, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 22.911, "characteristics.samples_per_query": 2368, "characteristics.samples_per_query.normalized_per_core": 2368.0, "characteristics.samples_per_query.normalized_per_processor": 2368.0, "ck_system": "DGX-A100_A100-SXM4x1_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 1024, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 2368, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x1_TRT", "system_name": "NVIDIA DGX-A100 (1x A100-SXM4, TensorRT)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "20e196c65cc74774", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 60587447, "90.00 percentile": 1, "90.00 percentile latency (ns)": 60862086, "95.00 percentile": 1, "95.00 percentile latency (ns)": 61191536, "97.00 percentile": 1, "97.00 percentile latency (ns)": 61215731, "99.00 percentile": 1, "99.00 percentile latency (ns)": 61255037, "99.90 percentile": 1, "99.90 percentile latency (ns)": 62088387, "Max latency (ns)": 110880100, "Mean latency (ns)": 60625381, "Min duration satisfied": "Yes", "Min latency (ns)": 59659378, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 60, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 20.111, "characteristics.samples_per_query": 60, "characteristics.samples_per_query.normalized_per_core": 60.0, "characteristics.samples_per_query.normalized_per_processor": 60.0, "ck_system": "DGX-A100_A100-SXM4x1_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 64, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 60, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x1_TRT", "system_name": "NVIDIA DGX-A100 (1x A100-SXM4, TensorRT)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "46ff4b482c8aa2de", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 24132998, "90.00 percentile": 1, "90.00 percentile latency (ns)": 40707091, "95.00 percentile": 1, "95.00 percentile latency (ns)": 42775738, "97.00 percentile": 1, "97.00 percentile latency (ns)": 43379999, "99.00 percentile": 1, "99.00 percentile latency (ns)": 43747674, "99.90 percentile": 2, "99.90 percentile latency (ns)": 44586888, "Max latency (ns)": 72316220, "Mean latency (ns)": 23650659, "Min duration satisfied": "Yes", "Min latency (ns)": 3403888, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 1600, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 22.912, "characteristics.samples_per_query": 1600, "characteristics.samples_per_query.normalized_per_core": 1600.0, "characteristics.samples_per_query.normalized_per_processor": 1600.0, "ck_system": "A100-PCIex1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 1024, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 1600, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT_Triton", "system_name": "Gigabyte G482-Z52 (1x A100-PCIe, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "d3d3eb0041658662", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 44058519, "90.00 percentile": 1, "90.00 percentile latency (ns)": 61750745, "95.00 percentile": 1, "95.00 percentile latency (ns)": 62265186, "97.00 percentile": 1, "97.00 percentile latency (ns)": 62509787, "99.00 percentile": 1, "99.00 percentile latency (ns)": 62871058, "99.90 percentile": 1, "99.90 percentile latency (ns)": 63256554, "Max latency (ns)": 96007461, "Mean latency (ns)": 39691714, "Min duration satisfied": "Yes", "Min latency (ns)": 16458222, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 48, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 20.111, "characteristics.samples_per_query": 48, "characteristics.samples_per_query.normalized_per_core": 48.0, "characteristics.samples_per_query.normalized_per_processor": 48.0, "ck_system": "A100-PCIex1_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 64, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 48, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT_Triton", "system_name": "Gigabyte G482-Z52 (1x A100-PCIe, TensorRT, Triton)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "8ac9e2464fc24872", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 41334749, "90.00 percentile": 1, "90.00 percentile latency (ns)": 43010888, "95.00 percentile": 1, "95.00 percentile latency (ns)": 43218718, "97.00 percentile": 1, "97.00 percentile latency (ns)": 43323375, "99.00 percentile": 1, "99.00 percentile latency (ns)": 43492381, "99.90 percentile": 1, "99.90 percentile latency (ns)": 43798531, "Max latency (ns)": 49064326, "Mean latency (ns)": 41637535, "Min duration satisfied": "Yes", "Min latency (ns)": 40128261, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 107, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 22.911, "characteristics.samples_per_query": 107, "characteristics.samples_per_query.normalized_per_core": 107.0, "characteristics.samples_per_query.normalized_per_processor": 107.0, "ck_system": "AGX_Xavier_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2", "host_memory_capacity": "32GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 1024, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 107, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT", "system_name": "NVIDIA Jetson AGX Xavier 32GB (TensorRT)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 8, "uid": "4bba508e04ee2eec", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 55803619, "90.00 percentile": 1, "90.00 percentile latency (ns)": 56048109, "95.00 percentile": 1, "95.00 percentile latency (ns)": 56127683, "97.00 percentile": 1, "97.00 percentile latency (ns)": 56181937, "99.00 percentile": 1, "99.00 percentile latency (ns)": 56289802, "99.90 percentile": 1, "99.90 percentile latency (ns)": 56488576, "Max latency (ns)": 57899630, "Mean latency (ns)": 55819976, "Min duration satisfied": "Yes", "Min latency (ns)": 55298666, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 2, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA AGX Xavier", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 20.111, "characteristics.samples_per_query": 2, "characteristics.samples_per_query.normalized_per_core": 2.0, "characteristics.samples_per_query.normalized_per_processor": 2.0, "ck_system": "AGX_Xavier_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2", "host_memory_capacity": "32GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 8, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32GB", "host_storage_type": "eMMC 5.1", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/AGX_Xavier_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 64, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 2, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/AGX_Xavier_TRT", "system_name": "NVIDIA Jetson AGX Xavier 32GB (TensorRT)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 8, "uid": "cdc124f8c8272ae3", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 29497351, "90.00 percentile": 1, "90.00 percentile latency (ns)": 44895216, "95.00 percentile": 1, "95.00 percentile latency (ns)": 44995322, "97.00 percentile": 1, "97.00 percentile latency (ns)": 45048794, "99.00 percentile": 1, "99.00 percentile latency (ns)": 45145878, "99.90 percentile": 2, "99.90 percentile latency (ns)": 45500671, "Max latency (ns)": 82899380, "Mean latency (ns)": 29507358, "Min duration satisfied": "Yes", "Min latency (ns)": 13690321, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 1920, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 22.912, "characteristics.samples_per_query": 1920, "characteristics.samples_per_query.normalized_per_core": 1920.0, "characteristics.samples_per_query.normalized_per_processor": 1920.0, "ck_system": "DGX-A100_A100-SXM4x1_TRT_Triton", "ck_used": true, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 1024, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 1920, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x1_TRT_Triton", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 240, "uid": "40b50d44f1691ef4", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 46479882, "90.00 percentile": 1, "90.00 percentile latency (ns)": 61747654, "95.00 percentile": 1, "95.00 percentile latency (ns)": 61826551, "97.00 percentile": 1, "97.00 percentile latency (ns)": 61878574, "99.00 percentile": 1, "99.00 percentile latency (ns)": 61969883, "99.90 percentile": 2, "99.90 percentile latency (ns)": 64650640, "Max latency (ns)": 297373922, "Mean latency (ns)": 39759068, "Min duration satisfied": "Yes", "Min latency (ns)": 17251538, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 56, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 20.111, "characteristics.samples_per_query": 56, "characteristics.samples_per_query.normalized_per_core": 56.0, "characteristics.samples_per_query.normalized_per_processor": 56.0, "ck_system": "DGX-A100_A100-SXM4x1_TRT_Triton", "ck_used": true, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 64, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 56, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x1_TRT_Triton", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 240, "uid": "20b1d8ed504c8e51", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 27726238, "90.00 percentile": 1, "90.00 percentile latency (ns)": 45958091, "95.00 percentile": 1, "95.00 percentile latency (ns)": 47091773, "97.00 percentile": 1, "97.00 percentile latency (ns)": 47465072, "99.00 percentile": 1, "99.00 percentile latency (ns)": 47897205, "99.90 percentile": 1, "99.90 percentile latency (ns)": 48072213, "Max latency (ns)": 75595152, "Mean latency (ns)": 30242855, "Min duration satisfied": "Yes", "Min latency (ns)": 11527943, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 360, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 22.911, "characteristics.samples_per_query": 360, "characteristics.samples_per_query.normalized_per_core": 360.0, "characteristics.samples_per_query.normalized_per_processor": 360.0, "ck_system": "T4x1_TRT_Triton", "ck_used": true, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 1024, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 360, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x1_TRT_Triton", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 240, "uid": "24ed6f48dcbda186", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 41537228, "90.00 percentile": 1, "90.00 percentile latency (ns)": 59706732, "95.00 percentile": 1, "95.00 percentile latency (ns)": 60286721, "97.00 percentile": 1, "97.00 percentile latency (ns)": 60426151, "99.00 percentile": 1, "99.00 percentile latency (ns)": 60860128, "99.90 percentile": 1, "99.90 percentile latency (ns)": 61704295, "Max latency (ns)": 106767005, "Mean latency (ns)": 37495380, "Min duration satisfied": "Yes", "Min latency (ns)": 14320303, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per query": 8, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 20.111, "characteristics.samples_per_query": 8, "characteristics.samples_per_query.normalized_per_core": 8.0, "characteristics.samples_per_query.normalized_per_processor": 8.0, "ck_system": "T4x1_TRT_Triton", "ck_used": true, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x1_TRT_Triton", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0, Triton 20.09; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 64, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 8, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x1_TRT_Triton", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 240, "uid": "19adc89ee3dfc53a", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 28065926, "90.00 percentile": 1, "90.00 percentile latency (ns)": 47320823, "95.00 percentile": 1, "95.00 percentile latency (ns)": 48172720, "97.00 percentile": 1, "97.00 percentile latency (ns)": 48269909, "99.00 percentile": 1, "99.00 percentile latency (ns)": 48403583, "99.90 percentile": 1, "99.90 percentile latency (ns)": 49048233, "Max latency (ns)": 75009631, "Mean latency (ns)": 30766617, "Min duration satisfied": "Yes", "Min latency (ns)": 11260596, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 368, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 22.911, "characteristics.samples_per_query": 368, "characteristics.samples_per_query.normalized_per_core": 368.0, "characteristics.samples_per_query.normalized_per_processor": 368.0, "ck_system": "T4x1_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 1024, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 368, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x1_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (1x T4, TensorRT)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 56, "uid": "55bf4c6db7f0f501", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 52660157, "90.00 percentile": 1, "90.00 percentile latency (ns)": 57732924, "95.00 percentile": 1, "95.00 percentile latency (ns)": 57997363, "97.00 percentile": 1, "97.00 percentile latency (ns)": 58137933, "99.00 percentile": 1, "99.00 percentile latency (ns)": 58642734, "99.90 percentile": 1, "99.90 percentile latency (ns)": 58955061, "Max latency (ns)": 103274423, "Mean latency (ns)": 42935932, "Min duration satisfied": "Yes", "Min latency (ns)": 23723031, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 8, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 20.111, "characteristics.samples_per_query": 8, "characteristics.samples_per_query.normalized_per_core": 8.0, "characteristics.samples_per_query.normalized_per_processor": 8.0, "ck_system": "T4x1_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 64, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 8, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x1_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (1x T4, TensorRT)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 56, "uid": "0b10ab94d76d9a68", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 27214465, "90.00 percentile": 1, "90.00 percentile latency (ns)": 45566816, "95.00 percentile": 1, "95.00 percentile latency (ns)": 46675785, "97.00 percentile": 1, "97.00 percentile latency (ns)": 46913489, "99.00 percentile": 1, "99.00 percentile latency (ns)": 47296454, "99.90 percentile": 2, "99.90 percentile latency (ns)": 48507210, "Max latency (ns)": 73327964, "Mean latency (ns)": 25834098, "Min duration satisfied": "Yes", "Min latency (ns)": 3713856, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 1600, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 22.912, "characteristics.samples_per_query": 1600, "characteristics.samples_per_query.normalized_per_core": 1600.0, "characteristics.samples_per_query.normalized_per_processor": 1600.0, "ck_system": "A100-PCIex1_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 1024, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 1600, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT", "system_name": "Gigabyte G482-Z52 (1x A100-PCIe, TensorRT)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "8b116e7ca6cc8d95", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 55847761, "90.00 percentile": 1, "90.00 percentile latency (ns)": 60463846, "95.00 percentile": 1, "95.00 percentile latency (ns)": 60733811, "97.00 percentile": 1, "97.00 percentile latency (ns)": 60879763, "99.00 percentile": 1, "99.00 percentile latency (ns)": 61113996, "99.90 percentile": 1, "99.90 percentile latency (ns)": 61543530, "Max latency (ns)": 93241229, "Mean latency (ns)": 44830407, "Min duration satisfied": "Yes", "Min latency (ns)": 26865886, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 48, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 20.111, "characteristics.samples_per_query": 48, "characteristics.samples_per_query.normalized_per_core": 48.0, "characteristics.samples_per_query.normalized_per_processor": 48.0, "ck_system": "A100-PCIex1_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/A100-PCIex1_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 64, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 48, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex1_TRT", "system_name": "Gigabyte G482-Z52 (1x A100-PCIe, TensorRT)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 128, "uid": "13b32500c5d69b20", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 42673743, "90.00 percentile": 1, "90.00 percentile latency (ns)": 45486706, "95.00 percentile": 1, "95.00 percentile latency (ns)": 45540987, "97.00 percentile": 1, "97.00 percentile latency (ns)": 45571477, "99.00 percentile": 1, "99.00 percentile latency (ns)": 45649757, "99.90 percentile": 1, "99.90 percentile latency (ns)": 48779623, "Max latency (ns)": 121867020, "Mean latency (ns)": 43359462, "Min duration satisfied": "Yes", "Min latency (ns)": 39863048, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 60, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 22.926, "characteristics.samples_per_query": 60, "characteristics.samples_per_query.normalized_per_core": 60.0, "characteristics.samples_per_query.normalized_per_processor": 60.0, "ck_system": "Xavier_NX_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2", "host_memory_capacity": "8GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "ssd-mobilenet", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/Xavier_NX_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 1024, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 60, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT", "system_name": "NVIDIA Jetson Xavier NX (TensorRT)", "system_type": "edge", "target_latency (ns)": 50000000, "target_qps": 20, "task": "object detection", "task2": "object detection", "total_cores": 6, "uid": "a4e5988ff1ef4400", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile": 1, "50.00 percentile latency (ns)": 47042784, "90.00 percentile": 1, "90.00 percentile latency (ns)": 47165890, "95.00 percentile": 1, "95.00 percentile latency (ns)": 47207918, "97.00 percentile": 1, "97.00 percentile latency (ns)": 47235188, "99.00 percentile": 1, "99.00 percentile latency (ns)": 47291350, "99.90 percentile": 1, "99.90 percentile latency (ns)": 47581292, "Max latency (ns)": 89006130, "Mean latency (ns)": 47035985, "Min duration satisfied": "Yes", "Min latency (ns)": 46683685, "Min queries satisfied": "Yes", "Mode": "Performance", "Per-sample latency": "", "Performance constraints satisfied": "Yes", "Result is": "VALID", "SUT name": "LWIS_Server", "Samples per query": 1, "Scenario": "multistream", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "Shared with host", "accelerator_memory_configuration": "SRAM", "accelerator_model_name": "NVIDIA Xavier NX", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.mAP": 20.111, "characteristics.samples_per_query": 1, "characteristics.samples_per_query.normalized_per_core": 1.0, "characteristics.samples_per_query.normalized_per_processor": 1.0, "ck_system": "Xavier_NX_TRT", "ck_used": false, "cooling": "", "dataset": "COCO 2017 (300x300)", "dataset_link": "https://github.com/ctuning/ck/blob/master/docs/mlperf-automation/datasets/coco2017.md", "dim_x_default": "characteristics.samples_per_query", "dim_y_default": "characteristics.mAP", "dim_y_maximize": true, "division": "closed", "formal_model": "ssd-mobilenet", "formal_model_accuracy": 99.0, "formal_model_link": "https://github.com/mlcommons/ck-mlops/tree/main/package", "framework": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2", "host_memory_capacity": "8GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 6, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "NVIDIA Carmel (ARMv8.2)", "host_processors_per_node": 1, "host_storage_capacity": "32GB", "host_storage_type": "Micro SD Card", "hw_notes": "GPU and both DLAs are used in resnet50, ssd-mobilenet, and ssd-resnet34, in Offline and MultiStream scenarios", "informal_model": "ssd-resnet34", "input_data_types": "int8", "key.accuracy": "characteristics.mAP", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 270336, "mlperf_version": 0.7, "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/Xavier_NX_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "20.09 Jetson CUDA-X AI Developer Preview, TensorRT 7.2, CUDA 10.2, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 64, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 1, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "resnet34-ssd1200.pytorch", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/Xavier_NX_TRT", "system_name": "NVIDIA Jetson Xavier NX (TensorRT)", "system_type": "edge", "target_latency (ns)": 66666666, "target_qps": 15, "task": "object detection", "task2": "object detection", "total_cores": 6, "uid": "78b73d745fca44bb", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" } ]