[ { "50.00 percentile latency (ns)": 332591076882, "90.00 percentile latency (ns)": 599336495346, "95.00 percentile latency (ns)": 632686734181, "97.00 percentile latency (ns)": 646027773819, "99.00 percentile latency (ns)": 659364958582, "99.90 percentile latency (ns)": 665369867143, "Max latency (ns)": 666039511400, "Mean latency (ns)": 15047547427, "Min duration satisfied": "Yes", "Min latency (ns)": 10720862, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 784818, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 784818, "characteristics.samples_per_second.normalized_per_core": 98102.25, "characteristics.samples_per_second.normalized_per_processor": 98102.25, "ck_system": "A10x8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A10x8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 522720000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x8_TRT_Triton", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x A10, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 792000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "55afd7944fbb9140", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 332591076882, "90.00 percentile latency (ns)": 599336495346, "95.00 percentile latency (ns)": 632686734181, "97.00 percentile latency (ns)": 646027773819, "99.00 percentile latency (ns)": 659364958582, "99.90 percentile latency (ns)": 665369867143, "Max latency (ns)": 666039511400, "Mean latency (ns)": 15047547427, "Min duration satisfied": "Yes", "Min latency (ns)": 10720862, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 784818, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 784818, "characteristics.samples_per_second.normalized_per_core": 98102.25, "characteristics.samples_per_second.normalized_per_processor": 98102.25, "ck_system": "A10x8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A10x8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 522720000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x8_TRT_Triton", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x A10, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 792000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "01d54f8884511b6c", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 312336922545, "90.00 percentile latency (ns)": 565239497899, "95.00 percentile latency (ns)": 596947319600, "97.00 percentile latency (ns)": 609635740013, "99.00 percentile latency (ns)": 622328882387, "99.90 percentile latency (ns)": 628053568839, "Max latency (ns)": 628689851179, "Mean latency (ns)": 5499777768, "Min duration satisfied": "Yes", "Min latency (ns)": 9316327, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1049800.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1049800.0, "characteristics.samples_per_second.normalized_per_core": 262450.0, "characteristics.samples_per_second.normalized_per_processor": 262450.0, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 660000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT_Triton", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "42a8afc03b215abc", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 312336922545, "90.00 percentile latency (ns)": 565239497899, "95.00 percentile latency (ns)": 596947319600, "97.00 percentile latency (ns)": 609635740013, "99.00 percentile latency (ns)": 622328882387, "99.90 percentile latency (ns)": 628053568839, "Max latency (ns)": 628689851179, "Mean latency (ns)": 5499777768, "Min duration satisfied": "Yes", "Min latency (ns)": 9316327, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1049800.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1049800.0, "characteristics.samples_per_second.normalized_per_core": 262450.0, "characteristics.samples_per_second.normalized_per_processor": 262450.0, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 660000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT_Triton", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "595e9682094988d3", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 343565603823, "90.00 percentile latency (ns)": 597828082533, "95.00 percentile latency (ns)": 629595490605, "97.00 percentile latency (ns)": 642303543519, "99.00 percentile latency (ns)": 655011881876, "99.90 percentile latency (ns)": 660729015507, "Max latency (ns)": 661365205369, "Mean latency (ns)": 13275035240, "Min duration satisfied": "Yes", "Min latency (ns)": 26620359366, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1097730.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1097730.0, "characteristics.samples_per_second.normalized_per_core": 274432.5, "characteristics.samples_per_second.normalized_per_processor": 274432.5, "ck_system": "A100-PCIe-80GB_aarch64x4_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.2, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 80, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Ampere Altra Q80-30", "host_processors_per_node": 1, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIe-80GB_aarch64x4_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.2, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 726000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIe-80GB_aarch64x4_TRT", "system_name": "Gigabyte G242-P31 (4x A100-PCIe-80GB_aarch64, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1100000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 80, "uid": "ead025a64104be4b", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 343565603823, "90.00 percentile latency (ns)": 597828082533, "95.00 percentile latency (ns)": 629595490605, "97.00 percentile latency (ns)": 642303543519, "99.00 percentile latency (ns)": 655011881876, "99.90 percentile latency (ns)": 660729015507, "Max latency (ns)": 661365205369, "Mean latency (ns)": 13275035240, "Min duration satisfied": "Yes", "Min latency (ns)": 26620359366, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1097730.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1097730.0, "characteristics.samples_per_second.normalized_per_core": 274432.5, "characteristics.samples_per_second.normalized_per_processor": 274432.5, "ck_system": "A100-PCIe-80GB_aarch64x4_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.2, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 80, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Ampere Altra Q80-30", "host_processors_per_node": 1, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIe-80GB_aarch64x4_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.2, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 726000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIe-80GB_aarch64x4_TRT", "system_name": "Gigabyte G242-P31 (4x A100-PCIe-80GB_aarch64, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1100000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 80, "uid": "30b40801342ce807", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 337059126096, "90.00 percentile latency (ns)": 604528375440, "95.00 percentile latency (ns)": 637960082007, "97.00 percentile latency (ns)": 651332736675, "99.00 percentile latency (ns)": 664711679234, "99.90 percentile latency (ns)": 676814734844, "Max latency (ns)": 682171606052, "Mean latency (ns)": 12666630285, "Min duration satisfied": "Yes", "Min latency (ns)": 3017106616, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1083600.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1083600.0, "characteristics.samples_per_second.normalized_per_core": 135450.0, "characteristics.samples_per_second.normalized_per_processor": 135450.0, "ck_system": "A30x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30x8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 739200000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x8_TRT", "system_name": "Gigabyte G482-Z54 (8x A30, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "44d4423bc83bb5cb", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 337059126096, "90.00 percentile latency (ns)": 604528375440, "95.00 percentile latency (ns)": 637960082007, "97.00 percentile latency (ns)": 651332736675, "99.00 percentile latency (ns)": 664711679234, "99.90 percentile latency (ns)": 676814734844, "Max latency (ns)": 682171606052, "Mean latency (ns)": 12666630285, "Min duration satisfied": "Yes", "Min latency (ns)": 3017106616, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1083600.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1083600.0, "characteristics.samples_per_second.normalized_per_core": 135450.0, "characteristics.samples_per_second.normalized_per_processor": 135450.0, "ck_system": "A30x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30x8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 739200000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x8_TRT", "system_name": "Gigabyte G482-Z54 (8x A30, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "3e24e127645f6a84", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 336688054324, "90.00 percentile latency (ns)": 606287395813, "95.00 percentile latency (ns)": 639979354348, "97.00 percentile latency (ns)": 653455304598, "99.00 percentile latency (ns)": 666932934105, "99.90 percentile latency (ns)": 672996421125, "Max latency (ns)": 673667710542, "Mean latency (ns)": 12281048079, "Min duration satisfied": "Yes", "Min latency (ns)": 13274497, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1097280.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1097280.0, "characteristics.samples_per_second.normalized_per_core": 137160.0, "characteristics.samples_per_second.normalized_per_processor": 137160.0, "ck_system": "A30x8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30x8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 739200000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x8_TRT_Triton", "system_name": "Gigabyte G482-Z54 (8x A30, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "f4c73f240a5a535d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 336688054324, "90.00 percentile latency (ns)": 606287395813, "95.00 percentile latency (ns)": 639979354348, "97.00 percentile latency (ns)": 653455304598, "99.00 percentile latency (ns)": 666932934105, "99.90 percentile latency (ns)": 672996421125, "Max latency (ns)": 673667710542, "Mean latency (ns)": 12281048079, "Min duration satisfied": "Yes", "Min latency (ns)": 13274497, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1097280.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1097280.0, "characteristics.samples_per_second.normalized_per_core": 137160.0, "characteristics.samples_per_second.normalized_per_processor": 137160.0, "ck_system": "A30x8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30x8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 739200000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x8_TRT_Triton", "system_name": "Gigabyte G482-Z54 (8x A30, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "a1f77c4de3957b64", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 329353411493, "90.00 percentile latency (ns)": 588702107632, "95.00 percentile latency (ns)": 621112837068, "97.00 percentile latency (ns)": 634076557184, "99.00 percentile latency (ns)": 647479923467, "99.90 percentile latency (ns)": 657531689007, "Max latency (ns)": 660971669091, "Mean latency (ns)": 10706045692, "Min duration satisfied": "Yes", "Min latency (ns)": 6107576006, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2276650.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2276650.0, "characteristics.samples_per_second.normalized_per_core": 284581.25, "characteristics.samples_per_second.normalized_per_processor": 284581.25, "ck_system": "A100-PCIe-80GBx8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIe-80GBx8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1504800000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIe-80GBx8_TRT", "system_name": "Gigabyte G482-Z54 (8x A100-PCIe-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2280000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "88db73bbf8d518b7", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 329353411493, "90.00 percentile latency (ns)": 588702107632, "95.00 percentile latency (ns)": 621112837068, "97.00 percentile latency (ns)": 634076557184, "99.00 percentile latency (ns)": 647479923467, "99.90 percentile latency (ns)": 657531689007, "Max latency (ns)": 660971669091, "Mean latency (ns)": 10706045692, "Min duration satisfied": "Yes", "Min latency (ns)": 6107576006, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2276650.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2276650.0, "characteristics.samples_per_second.normalized_per_core": 284581.25, "characteristics.samples_per_second.normalized_per_processor": 284581.25, "ck_system": "A100-PCIe-80GBx8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIe-80GBx8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1504800000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIe-80GBx8_TRT", "system_name": "Gigabyte G482-Z54 (8x A100-PCIe-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2280000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "71d08342c6be1886", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 339562306420, "90.00 percentile latency (ns)": 591232438527, "95.00 percentile latency (ns)": 622689921689, "97.00 percentile latency (ns)": 635277582294, "99.00 percentile latency (ns)": 647866663345, "99.90 percentile latency (ns)": 653526412572, "Max latency (ns)": 654155946367, "Mean latency (ns)": 1876048166, "Min duration satisfied": "Yes", "Min latency (ns)": 25495981747, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2421440.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2421440.0, "characteristics.samples_per_second.normalized_per_core": 302680.0, "characteristics.samples_per_second.normalized_per_processor": 302680.0, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1584000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT", "system_name": "NVIDIA DGX A100 (8x A100-SXM-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "874b54b09824e468", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 339562306420, "90.00 percentile latency (ns)": 591232438527, "95.00 percentile latency (ns)": 622689921689, "97.00 percentile latency (ns)": 635277582294, "99.00 percentile latency (ns)": 647866663345, "99.90 percentile latency (ns)": 653526412572, "Max latency (ns)": 654155946367, "Mean latency (ns)": 1876048166, "Min duration satisfied": "Yes", "Min latency (ns)": 25495981747, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2421440.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2421440.0, "characteristics.samples_per_second.normalized_per_core": 302680.0, "characteristics.samples_per_second.normalized_per_processor": 302680.0, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1584000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT", "system_name": "NVIDIA DGX A100 (8x A100-SXM-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "b288429693731646", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 311150686149, "90.00 percentile latency (ns)": 560477525168, "95.00 percentile latency (ns)": 591815179554, "97.00 percentile latency (ns)": 604473013905, "99.00 percentile latency (ns)": 617157208480, "99.90 percentile latency (ns)": 626184679228, "Max latency (ns)": 628706049955, "Mean latency (ns)": 4511904119, "Min duration satisfied": "Yes", "Min latency (ns)": 4320079337, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1049780.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1049780.0, "characteristics.samples_per_second.normalized_per_core": 262445.0, "characteristics.samples_per_second.normalized_per_processor": 262445.0, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 660000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "7838e4810015833b", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 311150686149, "90.00 percentile latency (ns)": 560477525168, "95.00 percentile latency (ns)": 591815179554, "97.00 percentile latency (ns)": 604473013905, "99.00 percentile latency (ns)": 617157208480, "99.90 percentile latency (ns)": 626184679228, "Max latency (ns)": 628706049955, "Mean latency (ns)": 4511904119, "Min duration satisfied": "Yes", "Min latency (ns)": 4320079337, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1049780.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1049780.0, "characteristics.samples_per_second.normalized_per_core": 262445.0, "characteristics.samples_per_second.normalized_per_processor": 262445.0, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 660000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "bb520089b48db9e4", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 329737154818, "90.00 percentile latency (ns)": 593055724826, "95.00 percentile latency (ns)": 625929153975, "97.00 percentile latency (ns)": 639084848500, "99.00 percentile latency (ns)": 652233177429, "99.90 percentile latency (ns)": 658152172615, "Max latency (ns)": 658816178273, "Mean latency (ns)": 329686797734, "Min duration satisfied": "Yes", "Min latency (ns)": 485424657, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 31172.9, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30 (1x1g.6gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 31172.9, "characteristics.samples_per_second.normalized_per_core": 31172.9, "characteristics.samples_per_second.normalized_per_processor": 31172.9, "ck_system": "A30-MIG_1x1g.6gb_TRT_HeteroMultiUse", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30-MIG_1x1g.6gb_TRT_HeteroMultiUse", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 20537220, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30-MIG_1x1g.6gb_TRT_HeteroMultiUse", "system_name": "Gigabyte G482-Z54 (1x A30-MIG-1x1g.6gb, TensorRT, HeteroMultiUse)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 31117, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "710baf2415b80c92", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 329737154818, "90.00 percentile latency (ns)": 593055724826, "95.00 percentile latency (ns)": 625929153975, "97.00 percentile latency (ns)": 639084848500, "99.00 percentile latency (ns)": 652233177429, "99.90 percentile latency (ns)": 658152172615, "Max latency (ns)": 658816178273, "Mean latency (ns)": 329686797734, "Min duration satisfied": "Yes", "Min latency (ns)": 485424657, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 31172.9, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30 (1x1g.6gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 31172.9, "characteristics.samples_per_second.normalized_per_core": 31172.9, "characteristics.samples_per_second.normalized_per_processor": 31172.9, "ck_system": "A30-MIG_1x1g.6gb_TRT_HeteroMultiUse", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A30-MIG_1x1g.6gb_TRT_HeteroMultiUse", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 20537220, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30-MIG_1x1g.6gb_TRT_HeteroMultiUse", "system_name": "Gigabyte G482-Z54 (1x A30-MIG-1x1g.6gb, TensorRT, HeteroMultiUse)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 31117, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "3ac461c445bcb341", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 360831083854, "90.00 percentile latency (ns)": 648940745429, "95.00 percentile latency (ns)": 684977771169, "97.00 percentile latency (ns)": 699377387134, "99.00 percentile latency (ns)": 713759444854, "99.90 percentile latency (ns)": 720235685546, "Max latency (ns)": 720954189889, "Mean latency (ns)": 360854746538, "Min duration satisfied": "Yes", "Min latency (ns)": 617162430, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 36618.1, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB (1x1g.10gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 36618.1, "characteristics.samples_per_second.normalized_per_core": 36618.1, "characteristics.samples_per_second.normalized_per_processor": 36618.1, "ck_system": "DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 26400000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "system_name": "NVIDIA DGX A100 (1x A100-SXM-80GB-MIG-1x1g.10gb, TensorRT, HeteroMultiUse)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 40000, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "4376073e47d899b5", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 360831083854, "90.00 percentile latency (ns)": 648940745429, "95.00 percentile latency (ns)": 684977771169, "97.00 percentile latency (ns)": 699377387134, "99.00 percentile latency (ns)": 713759444854, "99.90 percentile latency (ns)": 720235685546, "Max latency (ns)": 720954189889, "Mean latency (ns)": 360854746538, "Min duration satisfied": "Yes", "Min latency (ns)": 617162430, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 36618.1, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB (1x1g.10gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 36618.1, "characteristics.samples_per_second.normalized_per_core": 36618.1, "characteristics.samples_per_second.normalized_per_processor": 36618.1, "ck_system": "DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 26400000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "system_name": "NVIDIA DGX A100 (1x A100-SXM-80GB-MIG-1x1g.10gb, TensorRT, HeteroMultiUse)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 40000, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "0138114250969689", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 331480015456, "90.00 percentile latency (ns)": 595184065593, "95.00 percentile latency (ns)": 628308423425, "97.00 percentile latency (ns)": 641583326967, "99.00 percentile latency (ns)": 656704579384, "99.90 percentile latency (ns)": 666997660476, "Max latency (ns)": 668516905868, "Mean latency (ns)": 24389177195, "Min duration satisfied": "Yes", "Min latency (ns)": 4347358314, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 987260, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 1254.988938714499, "characteristics.power.normalized_per_core": 313.74723467862475, "characteristics.power.normalized_per_processor": 313.74723467862475, "characteristics.samples_per_second": 987260, "characteristics.samples_per_second.normalized_per_core": 246815.0, "characteristics.samples_per_second.normalized_per_processor": 246815.0, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 660000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, MaxQ, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "51810c0f5fc19a9b", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 331480015456, "90.00 percentile latency (ns)": 595184065593, "95.00 percentile latency (ns)": 628308423425, "97.00 percentile latency (ns)": 641583326967, "99.00 percentile latency (ns)": 656704579384, "99.90 percentile latency (ns)": 666997660476, "Max latency (ns)": 668516905868, "Mean latency (ns)": 24389177195, "Min duration satisfied": "Yes", "Min latency (ns)": 4347358314, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 987260, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 1254.988938714499, "characteristics.power.normalized_per_core": 313.74723467862475, "characteristics.power.normalized_per_processor": 313.74723467862475, "characteristics.samples_per_second": 987260, "characteristics.samples_per_second.normalized_per_core": 246815.0, "characteristics.samples_per_second.normalized_per_processor": 246815.0, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 660000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, MaxQ, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "4ea695ab510b7557", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 349070602182, "90.00 percentile latency (ns)": 624669875066, "95.00 percentile latency (ns)": 659116935231, "97.00 percentile latency (ns)": 672900476562, "99.00 percentile latency (ns)": 687371069461, "99.90 percentile latency (ns)": 698914973417, "Max latency (ns)": 703239205048, "Mean latency (ns)": 12722470379, "Min duration satisfied": "Yes", "Min latency (ns)": 5506651365, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2027190.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2027190.0, "characteristics.samples_per_second.normalized_per_core": 253398.75, "characteristics.samples_per_second.normalized_per_processor": 253398.75, "ck_system": "A100-PCIex8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIex8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1425600000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex8_TRT", "system_name": "Gigabyte G482-Z54 (8x A100-PCIe, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2160000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "47348beeae0ca978", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 349070602182, "90.00 percentile latency (ns)": 624669875066, "95.00 percentile latency (ns)": 659116935231, "97.00 percentile latency (ns)": 672900476562, "99.00 percentile latency (ns)": 687371069461, "99.90 percentile latency (ns)": 698914973417, "Max latency (ns)": 703239205048, "Mean latency (ns)": 12722470379, "Min duration satisfied": "Yes", "Min latency (ns)": 5506651365, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2027190.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2027190.0, "characteristics.samples_per_second.normalized_per_core": 253398.75, "characteristics.samples_per_second.normalized_per_processor": 253398.75, "ck_system": "A100-PCIex8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIex8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1425600000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex8_TRT", "system_name": "Gigabyte G482-Z54 (8x A100-PCIe, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2160000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "e94ca4b7adc13e66", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 314262463873, "90.00 percentile latency (ns)": 565256637925, "95.00 percentile latency (ns)": 596655696902, "97.00 percentile latency (ns)": 609201313746, "99.00 percentile latency (ns)": 621709108285, "99.90 percentile latency (ns)": 627362020754, "Max latency (ns)": 627993430933, "Mean latency (ns)": 17235947076, "Min duration satisfied": "Yes", "Min latency (ns)": 9649511, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1681550.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1681550.0, "characteristics.samples_per_second.normalized_per_core": 210193.75, "characteristics.samples_per_second.normalized_per_processor": 210193.75, "ck_system": "A100-PCIex8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIex8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1056000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex8_TRT_Triton", "system_name": "Gigabyte G482-Z54 (8x A100-PCIe, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1600000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "7d47cb436f9cddb6", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 314262463873, "90.00 percentile latency (ns)": 565256637925, "95.00 percentile latency (ns)": 596655696902, "97.00 percentile latency (ns)": 609201313746, "99.00 percentile latency (ns)": 621709108285, "99.90 percentile latency (ns)": 627362020754, "Max latency (ns)": 627993430933, "Mean latency (ns)": 17235947076, "Min duration satisfied": "Yes", "Min latency (ns)": 9649511, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1681550.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1681550.0, "characteristics.samples_per_second.normalized_per_core": 210193.75, "characteristics.samples_per_second.normalized_per_processor": 210193.75, "ck_system": "A100-PCIex8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIex8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1056000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex8_TRT_Triton", "system_name": "Gigabyte G482-Z54 (8x A100-PCIe, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1600000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "f723cc48748de902", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 326507101165, "90.00 percentile latency (ns)": 587154702689, "95.00 percentile latency (ns)": 619507578261, "97.00 percentile latency (ns)": 632486851611, "99.00 percentile latency (ns)": 645429271311, "99.90 percentile latency (ns)": 651255142841, "Max latency (ns)": 651902814039, "Mean latency (ns)": 114290341, "Min duration satisfied": "Yes", "Min latency (ns)": 5725600, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 2429810.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2429810.0, "characteristics.samples_per_second.normalized_per_core": 303726.25, "characteristics.samples_per_second.normalized_per_processor": 303726.25, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1584000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT_Triton", "system_name": "NVIDIA DGX A100 (8x A100-SXM-80GB, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "6b39f73db4cd7a60", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 326507101165, "90.00 percentile latency (ns)": 587154702689, "95.00 percentile latency (ns)": 619507578261, "97.00 percentile latency (ns)": 632486851611, "99.00 percentile latency (ns)": 645429271311, "99.90 percentile latency (ns)": 651255142841, "Max latency (ns)": 651902814039, "Mean latency (ns)": 114290341, "Min duration satisfied": "Yes", "Min latency (ns)": 5725600, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 2429810.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2429810.0, "characteristics.samples_per_second.normalized_per_core": 303726.25, "characteristics.samples_per_second.normalized_per_processor": 303726.25, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1584000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT_Triton", "system_name": "NVIDIA DGX A100 (8x A100-SXM-80GB, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "00846fdab79f41ea", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 388286789906, "90.00 percentile latency (ns)": 683598840015, "95.00 percentile latency (ns)": 720542587681, "97.00 percentile latency (ns)": 735327525411, "99.00 percentile latency (ns)": 750119897796, "99.90 percentile latency (ns)": 756772329989, "Max latency (ns)": 757512036111, "Mean latency (ns)": 4384359331, "Min duration satisfied": "Yes", "Min latency (ns)": 22112730500, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2091060.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 3324.244591029025, "characteristics.power.normalized_per_core": 415.5305738786281, "characteristics.power.normalized_per_processor": 415.5305738786281, "characteristics.samples_per_second": 2091060.0, "characteristics.samples_per_second.normalized_per_core": 261382.5, "characteristics.samples_per_second.normalized_per_processor": 261382.5, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1584000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "system_name": "NVIDIA DGX A100 (8x A100-SXM-80GB, MaxQ, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "3782bf209bb6433d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 388286789906, "90.00 percentile latency (ns)": 683598840015, "95.00 percentile latency (ns)": 720542587681, "97.00 percentile latency (ns)": 735327525411, "99.00 percentile latency (ns)": 750119897796, "99.90 percentile latency (ns)": 756772329989, "Max latency (ns)": 757512036111, "Mean latency (ns)": 4384359331, "Min duration satisfied": "Yes", "Min latency (ns)": 22112730500, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2091060.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 3324.244591029025, "characteristics.power.normalized_per_core": 415.5305738786281, "characteristics.power.normalized_per_processor": 415.5305738786281, "characteristics.samples_per_second": 2091060.0, "characteristics.samples_per_second.normalized_per_core": 261382.5, "characteristics.samples_per_second.normalized_per_processor": 261382.5, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1584000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "system_name": "NVIDIA DGX A100 (8x A100-SXM-80GB, MaxQ, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "9c9c31aa740b0f04", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 309252972346, "90.00 percentile latency (ns)": 555663468394, "95.00 percentile latency (ns)": 586455695421, "97.00 percentile latency (ns)": 598770349669, "99.00 percentile latency (ns)": 611103151546, "99.90 percentile latency (ns)": 616647842268, "Max latency (ns)": 617261521540, "Mean latency (ns)": 12335900780, "Min duration satisfied": "Yes", "Min latency (ns)": 7212672, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1710780.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1710780.0, "characteristics.samples_per_second.normalized_per_core": 213847.5, "characteristics.samples_per_second.normalized_per_processor": 213847.5, "ck_system": "A100-PCIe-80GBx8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIe-80GBx8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1056000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIe-80GBx8_TRT_Triton", "system_name": "Gigabyte G482-Z54 (8x A100-PCIe-80GB, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1600000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "a4a5564f17b013f9", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 309252972346, "90.00 percentile latency (ns)": 555663468394, "95.00 percentile latency (ns)": 586455695421, "97.00 percentile latency (ns)": 598770349669, "99.00 percentile latency (ns)": 611103151546, "99.90 percentile latency (ns)": 616647842268, "Max latency (ns)": 617261521540, "Mean latency (ns)": 12335900780, "Min duration satisfied": "Yes", "Min latency (ns)": 7212672, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1710780.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1710780.0, "characteristics.samples_per_second.normalized_per_core": 213847.5, "characteristics.samples_per_second.normalized_per_processor": 213847.5, "ck_system": "A100-PCIe-80GBx8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A100-PCIe-80GBx8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0, Triton 21.07", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 1056000000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIe-80GBx8_TRT_Triton", "system_name": "Gigabyte G482-Z54 (8x A100-PCIe-80GB, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1600000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "9b0d57861f3b3c32", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 340872134598, "90.00 percentile latency (ns)": 607499909469, "95.00 percentile latency (ns)": 640830591343, "97.00 percentile latency (ns)": 654161281135, "99.00 percentile latency (ns)": 667492925544, "99.90 percentile latency (ns)": 675303852434, "Max latency (ns)": 676642229641, "Mean latency (ns)": 23411045764, "Min duration satisfied": "Yes", "Min latency (ns)": 8880325528, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 772521, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 772521, "characteristics.samples_per_second.normalized_per_core": 96565.125, "characteristics.samples_per_second.normalized_per_processor": 96565.125, "ck_system": "A10x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A10x8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 522720000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x8_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x A10, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 792000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "a83137d073becea8", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 340872134598, "90.00 percentile latency (ns)": 607499909469, "95.00 percentile latency (ns)": 640830591343, "97.00 percentile latency (ns)": 654161281135, "99.00 percentile latency (ns)": 667492925544, "99.90 percentile latency (ns)": 675303852434, "Max latency (ns)": 676642229641, "Mean latency (ns)": 23411045764, "Min duration satisfied": "Yes", "Min latency (ns)": 8880325528, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 772521, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 772521, "characteristics.samples_per_second.normalized_per_core": 96565.125, "characteristics.samples_per_second.normalized_per_processor": 96565.125, "ck_system": "A10x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 8.0.1, CUDA 11.3", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.1, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.1/tree/master/closed/NVIDIA/results/A10x8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 20.04.4", "other_hardware": "", "other_software_stack": "TensorRT 8.0.1, CUDA 11.3, cuDNN 8.2.1, Driver 470.42.01, DALI 0.31.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 1624344308455410291, "retraining": "No", "sample_index_rng_seed": 517984244576520566, "samples_per_query": 522720000, "schedule_rng_seed": 10051496985653635065, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x8_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x A10, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 792000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "cdc9939407fba19f", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 332424145729, "90.00 percentile latency (ns)": 598767268210, "95.00 percentile latency (ns)": 632057196664, "97.00 percentile latency (ns)": 645374995874, "99.00 percentile latency (ns)": 658692339179, "99.90 percentile latency (ns)": 664686564641, "Max latency (ns)": 665351565674, "Mean latency (ns)": 14750814281, "Min duration satisfied": "Yes", "Min latency (ns)": 10648432, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 785630, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 785630, "characteristics.samples_per_second.normalized_per_core": 98203.75, "characteristics.samples_per_second.normalized_per_processor": 98203.75, "ck_system": "A10x8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A10x8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 522720000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x8_TRT_Triton", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x A10, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 792000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "b904ebcb6ea4bf59", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 332424145729, "90.00 percentile latency (ns)": 598767268210, "95.00 percentile latency (ns)": 632057196664, "97.00 percentile latency (ns)": 645374995874, "99.00 percentile latency (ns)": 658692339179, "99.90 percentile latency (ns)": 664686564641, "Max latency (ns)": 665351565674, "Mean latency (ns)": 14750814281, "Min duration satisfied": "Yes", "Min latency (ns)": 10648432, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 785630, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 785630, "characteristics.samples_per_second.normalized_per_core": 98203.75, "characteristics.samples_per_second.normalized_per_processor": 98203.75, "ck_system": "A10x8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A10x8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 522720000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x8_TRT_Triton", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x A10, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 792000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "ad4b8abc73a87e43", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 328965676289, "90.00 percentile latency (ns)": 590820888404, "95.00 percentile latency (ns)": 623485377391, "97.00 percentile latency (ns)": 636525035427, "99.00 percentile latency (ns)": 649612898577, "99.90 percentile latency (ns)": 655488072391, "Max latency (ns)": 656135410160, "Mean latency (ns)": 58717370642, "Min duration satisfied": "Yes", "Min latency (ns)": 2880233029, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 311826, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 311826, "characteristics.samples_per_second.normalized_per_core": 311826.0, "characteristics.samples_per_second.normalized_per_processor": 311826.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_datacenter", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_datacenter", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 204600000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_datacenter", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 310000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "03caf32f9e76d4d1", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 328965676289, "90.00 percentile latency (ns)": 590820888404, "95.00 percentile latency (ns)": 623485377391, "97.00 percentile latency (ns)": 636525035427, "99.00 percentile latency (ns)": 649612898577, "99.90 percentile latency (ns)": 655488072391, "Max latency (ns)": 656135410160, "Mean latency (ns)": 58717370642, "Min duration satisfied": "Yes", "Min latency (ns)": 2880233029, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 311826, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 311826, "characteristics.samples_per_second.normalized_per_core": 311826.0, "characteristics.samples_per_second.normalized_per_processor": 311826.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_datacenter", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_datacenter", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 204600000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_datacenter", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 310000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "1d493064ee38e82d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 332905615027, "90.00 percentile latency (ns)": 599636489782, "95.00 percentile latency (ns)": 632964928158, "97.00 percentile latency (ns)": 646305158068, "99.00 percentile latency (ns)": 659640362825, "99.90 percentile latency (ns)": 665640901390, "Max latency (ns)": 666316778144, "Mean latency (ns)": 62572300933, "Min duration satisfied": "Yes", "Min latency (ns)": 4463850, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 307061, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 307061, "characteristics.samples_per_second.normalized_per_core": 307061.0, "characteristics.samples_per_second.normalized_per_processor": 307061.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_Triton_datacenter", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_Triton_datacenter", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 204600000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_Triton_datacenter", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 310000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "60d33b8e64e0e9f5", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 332905615027, "90.00 percentile latency (ns)": 599636489782, "95.00 percentile latency (ns)": 632964928158, "97.00 percentile latency (ns)": 646305158068, "99.00 percentile latency (ns)": 659640362825, "99.90 percentile latency (ns)": 665640901390, "Max latency (ns)": 666316778144, "Mean latency (ns)": 62572300933, "Min duration satisfied": "Yes", "Min latency (ns)": 4463850, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 307061, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 307061, "characteristics.samples_per_second.normalized_per_core": 307061.0, "characteristics.samples_per_second.normalized_per_processor": 307061.0, "ck_system": "DGX-A100_A100-SXM-80GBx1_TRT_Triton_datacenter", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx1_TRT_Triton_datacenter", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0, Triton 21.02; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 204600000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx1_TRT_Triton_datacenter", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 310000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "088b86242f6ab2eb", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 354661045085, "90.00 percentile latency (ns)": 624920443090, "95.00 percentile latency (ns)": 658712499265, "97.00 percentile latency (ns)": 672210392071, "99.00 percentile latency (ns)": 685706790840, "99.90 percentile latency (ns)": 691774009738, "Max latency (ns)": 692449514867, "Mean latency (ns)": 5301161378, "Min duration satisfied": "Yes", "Min latency (ns)": 17140018716, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1067510.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1067510.0, "characteristics.samples_per_second.normalized_per_core": 133438.75, "characteristics.samples_per_second.normalized_per_processor": 133438.75, "ck_system": "A30x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A30x8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.46, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 739200000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x8_TRT", "system_name": "Gigabyte G482-Z54 (8x A30, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "99c3f7ba8417f8fa", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 354661045085, "90.00 percentile latency (ns)": 624920443090, "95.00 percentile latency (ns)": 658712499265, "97.00 percentile latency (ns)": 672210392071, "99.00 percentile latency (ns)": 685706790840, "99.90 percentile latency (ns)": 691774009738, "Max latency (ns)": 692449514867, "Mean latency (ns)": 5301161378, "Min duration satisfied": "Yes", "Min latency (ns)": 17140018716, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1067510.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1067510.0, "characteristics.samples_per_second.normalized_per_core": 133438.75, "characteristics.samples_per_second.normalized_per_processor": 133438.75, "ck_system": "A30x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A30x8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.46, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 739200000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x8_TRT", "system_name": "Gigabyte G482-Z54 (8x A30, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "bc81401a9fbc38ce", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 337836814354, "90.00 percentile latency (ns)": 608294741770, "95.00 percentile latency (ns)": 642092993465, "97.00 percentile latency (ns)": 655617934461, "99.00 percentile latency (ns)": 669160734714, "99.90 percentile latency (ns)": 675248827978, "Max latency (ns)": 675923166836, "Mean latency (ns)": 13429036839, "Min duration satisfied": "Yes", "Min latency (ns)": 13780237, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1093620.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1093620.0, "characteristics.samples_per_second.normalized_per_core": 136702.5, "characteristics.samples_per_second.normalized_per_processor": 136702.5, "ck_system": "A30x8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A30x8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.46, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 739200000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x8_TRT_Triton", "system_name": "Gigabyte G482-Z54 (8x A30, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "24d50bbdb944d39a", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 337836814354, "90.00 percentile latency (ns)": 608294741770, "95.00 percentile latency (ns)": 642092993465, "97.00 percentile latency (ns)": 655617934461, "99.00 percentile latency (ns)": 669160734714, "99.90 percentile latency (ns)": 675248827978, "Max latency (ns)": 675923166836, "Mean latency (ns)": 13429036839, "Min duration satisfied": "Yes", "Min latency (ns)": 13780237, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "Triton_Server", "Samples per second": 1093620.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A30", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1093620.0, "characteristics.samples_per_second.normalized_per_core": 136702.5, "characteristics.samples_per_second.normalized_per_processor": 136702.5, "ck_system": "A30x8_TRT_Triton", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A30x8_TRT_Triton", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.46, DALI 0.30.0, Triton 21.02", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 739200000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A30x8_TRT_Triton", "system_name": "Gigabyte G482-Z54 (8x A30, TensorRT, Triton)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "fc4c5aa9a93bea22", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 332694345892, "90.00 percentile latency (ns)": 581153923826, "95.00 percentile latency (ns)": 612214224015, "97.00 percentile latency (ns)": 624666047220, "99.00 percentile latency (ns)": 637080742592, "99.90 percentile latency (ns)": 642679376775, "Max latency (ns)": 643300593019, "Mean latency (ns)": 6699927866, "Min duration satisfied": "Yes", "Min latency (ns)": 22805509567, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2462300.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2462300.0, "characteristics.samples_per_second.normalized_per_core": 307787.5, "characteristics.samples_per_second.normalized_per_processor": 307787.5, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1584000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT", "system_name": "NVIDIA DGX-A100 (8x A100-SXM-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "14ddb8889aab7738", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 332694345892, "90.00 percentile latency (ns)": 581153923826, "95.00 percentile latency (ns)": 612214224015, "97.00 percentile latency (ns)": 624666047220, "99.00 percentile latency (ns)": 637080742592, "99.90 percentile latency (ns)": 642679376775, "Max latency (ns)": 643300593019, "Mean latency (ns)": 6699927866, "Min duration satisfied": "Yes", "Min latency (ns)": 22805509567, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2462300.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 2462300.0, "characteristics.samples_per_second.normalized_per_core": 307787.5, "characteristics.samples_per_second.normalized_per_processor": 307787.5, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1584000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT", "system_name": "NVIDIA DGX-A100 (8x A100-SXM-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "d7d59575f044ce99", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 308870549741, "90.00 percentile latency (ns)": 557037541938, "95.00 percentile latency (ns)": 588419036236, "97.00 percentile latency (ns)": 601007542283, "99.00 percentile latency (ns)": 613549639798, "99.90 percentile latency (ns)": 621730655791, "Max latency (ns)": 624081950442, "Mean latency (ns)": 2358271084, "Min duration satisfied": "Yes", "Min latency (ns)": 4217448721, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1057550.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1057550.0, "characteristics.samples_per_second.normalized_per_core": 264387.5, "characteristics.samples_per_second.normalized_per_processor": 264387.5, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 660000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "a71809754d430696", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 308870549741, "90.00 percentile latency (ns)": 557037541938, "95.00 percentile latency (ns)": 588419036236, "97.00 percentile latency (ns)": 601007542283, "99.00 percentile latency (ns)": 613549639798, "99.90 percentile latency (ns)": 621730655791, "Max latency (ns)": 624081950442, "Mean latency (ns)": 2358271084, "Min duration satisfied": "Yes", "Min latency (ns)": 4217448721, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1057550.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1057550.0, "characteristics.samples_per_second.normalized_per_core": 264387.5, "characteristics.samples_per_second.normalized_per_processor": 264387.5, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 660000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "ad74bcc8c65fa6cc", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 362187722442, "90.00 percentile latency (ns)": 651438887388, "95.00 percentile latency (ns)": 687639277353, "97.00 percentile latency (ns)": 702116065424, "99.00 percentile latency (ns)": 716596518057, "99.90 percentile latency (ns)": 723110373338, "Max latency (ns)": 723827274590, "Mean latency (ns)": 362178665272, "Min duration satisfied": "Yes", "Min latency (ns)": 610988809, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 36472.8, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB (1x1g.10gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 36472.8, "characteristics.samples_per_second.normalized_per_core": 36472.8, "characteristics.samples_per_second.normalized_per_processor": 36472.8, "ck_system": "DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 26400000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "system_name": "NVIDIA DGX-A100 (1x A100-SXM-80GB-MIG-1x1g.10gb, TensorRT, HeteroMultiUse)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 40000, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "1f8862dd62d1aac2", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 362187722442, "90.00 percentile latency (ns)": 651438887388, "95.00 percentile latency (ns)": 687639277353, "97.00 percentile latency (ns)": 702116065424, "99.00 percentile latency (ns)": 716596518057, "99.90 percentile latency (ns)": 723110373338, "Max latency (ns)": 723827274590, "Mean latency (ns)": 362178665272, "Min duration satisfied": "Yes", "Min latency (ns)": 610988809, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 36472.8, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB (1x1g.10gb MIG)", "accelerator_on-chip_memories": "", "accelerators_per_node": 1, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 36472.8, "characteristics.samples_per_second.normalized_per_core": 36472.8, "characteristics.samples_per_second.normalized_per_processor": 36472.8, "ck_system": "DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 1, "normalize_processors": 1, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 26400000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GB-MIG_1x1g.10gb_TRT_HeteroMultiUse", "system_name": "NVIDIA DGX-A100 (1x A100-SXM-80GB-MIG-1x1g.10gb, TensorRT, HeteroMultiUse)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 40000, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "eb6acf4e4cb1f53e", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 335465953493, "90.00 percentile latency (ns)": 603285042314, "95.00 percentile latency (ns)": 636774591443, "97.00 percentile latency (ns)": 650171060861, "99.00 percentile latency (ns)": 664677053650, "99.90 percentile latency (ns)": 674519481508, "Max latency (ns)": 677221388118, "Mean latency (ns)": 592624941, "Min duration satisfied": "Yes", "Min latency (ns)": 4285225558, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 974571, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 1277.8926144756279, "characteristics.power.normalized_per_core": 319.47315361890696, "characteristics.power.normalized_per_processor": 319.47315361890696, "characteristics.samples_per_second": 974571, "characteristics.samples_per_second.normalized_per_core": 243642.75, "characteristics.samples_per_second.normalized_per_processor": 243642.75, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 660000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, MaxQ, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "cbf89ff94698017d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 335465953493, "90.00 percentile latency (ns)": 603285042314, "95.00 percentile latency (ns)": 636774591443, "97.00 percentile latency (ns)": 650171060861, "99.00 percentile latency (ns)": 664677053650, "99.90 percentile latency (ns)": 674519481508, "Max latency (ns)": 677221388118, "Mean latency (ns)": 592624941, "Min duration satisfied": "Yes", "Min latency (ns)": 4285225558, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 974571, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 1277.8926144756279, "characteristics.power.normalized_per_core": 319.47315361890696, "characteristics.power.normalized_per_processor": 319.47315361890696, "characteristics.samples_per_second": 974571, "characteristics.samples_per_second.normalized_per_core": 243642.75, "characteristics.samples_per_second.normalized_per_processor": 243642.75, "ck_system": "DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 1, "host_storage_capacity": "10 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 660000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-Station-A100_A100-SXM-80GBx4_TRT_MaxQ", "system_name": "NVIDIA DGX Station A100 (4x A100-SXM-80GB, MaxQ, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": "1e+06", "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "6793ab229d17b30b", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 384583265090, "90.00 percentile latency (ns)": 675777615717, "95.00 percentile latency (ns)": 712189793390, "97.00 percentile latency (ns)": 726753311739, "99.00 percentile latency (ns)": 741316386567, "99.90 percentile latency (ns)": 747871359806, "Max latency (ns)": 748599357039, "Mean latency (ns)": 563925525, "Min duration satisfied": "Yes", "Min latency (ns)": 22507442172, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2115950.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 3415.354010695183, "characteristics.power.normalized_per_core": 426.9192513368979, "characteristics.power.normalized_per_processor": 426.9192513368979, "characteristics.samples_per_second": 2115950.0, "characteristics.samples_per_second.normalized_per_core": 264493.75, "characteristics.samples_per_second.normalized_per_processor": 264493.75, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1584000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "system_name": "NVIDIA DGX-A100 (8x A100-SXM-80GB, MaxQ, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "22d52a40ee4d2296", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 384583265090, "90.00 percentile latency (ns)": 675777615717, "95.00 percentile latency (ns)": 712189793390, "97.00 percentile latency (ns)": 726753311739, "99.00 percentile latency (ns)": 741316386567, "99.90 percentile latency (ns)": 747871359806, "Max latency (ns)": 748599357039, "Mean latency (ns)": 563925525, "Min duration satisfied": "Yes", "Min latency (ns)": 22507442172, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2115950.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "80 GB", "accelerator_memory_configuration": "HBM2e", "accelerator_model_name": "NVIDIA A100-SXM-80GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.power": 3415.354010695183, "characteristics.power.normalized_per_core": 426.9192513368979, "characteristics.power.normalized_per_processor": 426.9192513368979, "characteristics.samples_per_second": 2115950.0, "characteristics.samples_per_second.normalized_per_core": 264493.75, "characteristics.samples_per_second.normalized_per_processor": 264493.75, "ck_system": "DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "2 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 1584000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM-80GBx8_TRT_MaxQ", "system_name": "NVIDIA DGX-A100 (8x A100-SXM-80GB, MaxQ, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2400000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "00c24e65b0f59d93", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 340033298278, "90.00 percentile latency (ns)": 605800760061, "95.00 percentile latency (ns)": 639029913724, "97.00 percentile latency (ns)": 652317667153, "99.00 percentile latency (ns)": 665607160810, "99.90 percentile latency (ns)": 675431952182, "Max latency (ns)": 676766694363, "Mean latency (ns)": 22546370778, "Min duration satisfied": "Yes", "Min latency (ns)": 8840204478, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 772378, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 772378, "characteristics.samples_per_second.normalized_per_core": 96547.25, "characteristics.samples_per_second.normalized_per_processor": 96547.25, "ck_system": "A10x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A10x8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 522720000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x8_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x A10, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 792000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "1804f01a0fcca66d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 340033298278, "90.00 percentile latency (ns)": 605800760061, "95.00 percentile latency (ns)": 639029913724, "97.00 percentile latency (ns)": 652317667153, "99.00 percentile latency (ns)": 665607160810, "99.90 percentile latency (ns)": 675431952182, "Max latency (ns)": 676766694363, "Mean latency (ns)": 22546370778, "Min duration satisfied": "Yes", "Min latency (ns)": 8840204478, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 772378, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A10", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 772378, "characteristics.samples_per_second.normalized_per_core": 96547.25, "characteristics.samples_per_second.normalized_per_processor": 96547.25, "ck_system": "A10x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/NVIDIA/results/A10x8_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.4", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 522720000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "preview", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A10x8_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x A10, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 792000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "471e6da41a4884d9", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 34100820843, "90.00 percentile latency (ns)": 60710032626, "95.00 percentile latency (ns)": 64071400433, "97.00 percentile latency (ns)": 65413069997, "99.00 percentile latency (ns)": 66750827356, "99.90 percentile latency (ns)": 67356150237, "Max latency (ns)": 67423238172, "Mean latency (ns)": 34208771388, "Min duration satisfied": "Yes", "Min latency (ns)": 1365570005, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 665646, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 20, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 665646, "characteristics.samples_per_second.normalized_per_core": 33282.3, "characteristics.samples_per_second.normalized_per_processor": 33282.3, "ck_system": "T4x20_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 20, "normalize_processors": 20, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x20_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 44880000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x20_TRT", "system_name": "Supermicro 6049GP-TRT-OTO-29 (20x T4, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 680000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "2c91822c3c382230", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 34100820843, "90.00 percentile latency (ns)": 60710032626, "95.00 percentile latency (ns)": 64071400433, "97.00 percentile latency (ns)": 65413069997, "99.00 percentile latency (ns)": 66750827356, "99.90 percentile latency (ns)": 67356150237, "Max latency (ns)": 67423238172, "Mean latency (ns)": 34208771388, "Min duration satisfied": "Yes", "Min latency (ns)": 1365570005, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 665646, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 20, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 665646, "characteristics.samples_per_second.normalized_per_core": 33282.3, "characteristics.samples_per_second.normalized_per_processor": 33282.3, "ck_system": "T4x20_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 20, "normalize_processors": 20, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x20_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 44880000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x20_TRT", "system_name": "Supermicro 6049GP-TRT-OTO-29 (20x T4, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 680000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "413982d1c786150f", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 39334425324, "90.00 percentile latency (ns)": 69972684112, "95.00 percentile latency (ns)": 73811540741, "97.00 percentile latency (ns)": 75348187018, "99.00 percentile latency (ns)": 76887578395, "99.90 percentile latency (ns)": 77579311062, "Max latency (ns)": 77654717858, "Mean latency (ns)": 39401593368, "Min duration satisfied": "Yes", "Min latency (ns)": 1342955268, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 458955, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe", "accelerator_on-chip_memories": "", "accelerators_per_node": 2, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 458955, "characteristics.samples_per_second.normalized_per_core": 229477.5, "characteristics.samples_per_second.normalized_per_processor": 229477.5, "ck_system": "A100-PCIex2_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 2, "normalize_processors": 2, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/A100-PCIex2_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 35640000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex2_TRT", "system_name": "Gigabyte G482-Z52 (2x A100-PCIe, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 540000, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "96cb7a7584cd9c61", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 39334425324, "90.00 percentile latency (ns)": 69972684112, "95.00 percentile latency (ns)": 73811540741, "97.00 percentile latency (ns)": 75348187018, "99.00 percentile latency (ns)": 76887578395, "99.90 percentile latency (ns)": 77579311062, "Max latency (ns)": 77654717858, "Mean latency (ns)": 39401593368, "Min duration satisfied": "Yes", "Min latency (ns)": 1342955268, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 458955, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe", "accelerator_on-chip_memories": "", "accelerators_per_node": 2, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 458955, "characteristics.samples_per_second.normalized_per_core": 229477.5, "characteristics.samples_per_second.normalized_per_processor": 229477.5, "ck_system": "A100-PCIex2_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 2, "normalize_processors": 2, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/A100-PCIex2_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 35640000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/A100-PCIex2_TRT", "system_name": "Gigabyte G482-Z52 (2x A100-PCIe, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 540000, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "64edb6d6b3b414e8", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 34032204969, "90.00 percentile latency (ns)": 59771729371, "95.00 percentile latency (ns)": 62988013532, "97.00 percentile latency (ns)": 64273262138, "99.00 percentile latency (ns)": 65560021788, "99.90 percentile latency (ns)": 66138842658, "Max latency (ns)": 66202788343, "Mean latency (ns)": 34063880302, "Min duration satisfied": "Yes", "Min latency (ns)": 2007670951, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2113510.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 2113510.0, "characteristics.samples_per_second.normalized_per_core": 264188.75, "characteristics.samples_per_second.normalized_per_processor": 264188.75, "ck_system": "DGX-A100_A100-SXM4x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x8_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 139920000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x8_TRT", "system_name": "NVIDIA DGX-A100 (8x A100-SXM4, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "dade848efab60c58", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 34032204969, "90.00 percentile latency (ns)": 59771729371, "95.00 percentile latency (ns)": 62988013532, "97.00 percentile latency (ns)": 64273262138, "99.00 percentile latency (ns)": 65560021788, "99.90 percentile latency (ns)": 66138842658, "Max latency (ns)": 66202788343, "Mean latency (ns)": 34063880302, "Min duration satisfied": "Yes", "Min latency (ns)": 2007670951, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 2113510.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-SXM4", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 2113510.0, "characteristics.samples_per_second.normalized_per_core": 264188.75, "characteristics.samples_per_second.normalized_per_processor": 264188.75, "ck_system": "DGX-A100_A100-SXM4x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "1 TB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 64, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7742", "host_processors_per_node": 2, "host_storage_capacity": "15 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/DGX-A100_A100-SXM4x8_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 139920000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DGX-A100_A100-SXM4x8_TRT", "system_name": "NVIDIA DGX-A100 (8x A100-SXM4, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 2120000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 128, "uid": "68a84fc74de5a395", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 33014604822, "90.00 percentile latency (ns)": 59291234955, "95.00 percentile latency (ns)": 62590934074, "97.00 percentile latency (ns)": 63913046482, "99.00 percentile latency (ns)": 65233371635, "99.90 percentile latency (ns)": 65830039551, "Max latency (ns)": 65899193721, "Mean latency (ns)": 33107597352, "Min duration satisfied": "Yes", "Min latency (ns)": 603993098, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 272416, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 272416, "characteristics.samples_per_second.normalized_per_core": 34052.0, "characteristics.samples_per_second.normalized_per_processor": 34052.0, "ck_system": "T4x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x8_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 17952000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x8_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x T4, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 272000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "46b3839044788598", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 33014604822, "90.00 percentile latency (ns)": 59291234955, "95.00 percentile latency (ns)": 62590934074, "97.00 percentile latency (ns)": 63913046482, "99.00 percentile latency (ns)": 65233371635, "99.90 percentile latency (ns)": 65830039551, "Max latency (ns)": 65899193721, "Mean latency (ns)": 33107597352, "Min duration satisfied": "Yes", "Min latency (ns)": 603993098, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 272416, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 272416, "characteristics.samples_per_second.normalized_per_core": 34052.0, "characteristics.samples_per_second.normalized_per_processor": 34052.0, "ck_system": "T4x8_TRT", "ck_used": false, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 28, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "Intel(R) Xeon(R) Platinum 8280 CPU @ 2.70GHz", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/NVIDIA/results/T4x8_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.4", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 17952000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "NVIDIA", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/NVIDIA", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/T4x8_TRT", "system_name": "Supermicro 4029GP-TRT-OTO-28 (8x T4, TensorRT)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 272000, "task": "recommendation", "task2": "recommendation", "total_cores": 56, "uid": "748c09e1a15a46da", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" } ]