[ { "50.00 percentile latency (ns)": 666769154949, "90.00 percentile latency (ns)": 1196022668177, "95.00 percentile latency (ns)": 1262176733719, "97.00 percentile latency (ns)": 1288628147626, "99.00 percentile latency (ns)": 1315091077255, "99.90 percentile latency (ns)": 1326995771713, "Max latency (ns)": 1328313399843, "Mean latency (ns)": 50230098927, "Min duration satisfied": "Yes", "Min latency (ns)": 5076857454, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 135149, "Scenario": "offline", "accelerator_cooling_type": "passive", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "1.1.3", "characteristics.power": 829.0858540255822, "characteristics.power.normalized_per_core": 207.27146350639555, "characteristics.power.normalized_per_processor": 207.27146350639555, "characteristics.samples_per_second": 135149, "characteristics.samples_per_second.normalized_per_core": 33787.25, "characteristics.samples_per_second.normalized_per_processor": 33787.25, "ck_system": "XE2420_datacenter_T4x4_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "filesystem": "ext3/ext4", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_cooling_type": "passive", "host_memory_capacity": "384 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "4.40.10.00", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE2420_datacenter_T4x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_settings": "closed/DellEMC/measurements/XE2420_datacenter_T4x4_TRT/XE2420_T4x4_power_settings.adoc", "print_timestamps": 0, "problem": false, "psu_details": "2x2000W", "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 179520000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_cooling_type": "air", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_datacenter_T4x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 272000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "637866c5a1d5446f", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 666572186611, "90.00 percentile latency (ns)": 1195647522900, "95.00 percentile latency (ns)": 1261788772512, "97.00 percentile latency (ns)": 1288241972985, "99.00 percentile latency (ns)": 1314693516602, "99.90 percentile latency (ns)": 1326603156299, "Max latency (ns)": 1327919982242, "Mean latency (ns)": 50037665267, "Min duration satisfied": "Yes", "Min latency (ns)": 5085462432, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 135189, "Scenario": "offline", "accelerator_cooling_type": "passive", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "1.1.3", "characteristics.power": 830.1261295180716, "characteristics.power.normalized_per_core": 207.5315323795179, "characteristics.power.normalized_per_processor": 207.5315323795179, "characteristics.samples_per_second": 135189, "characteristics.samples_per_second.normalized_per_core": 33797.25, "characteristics.samples_per_second.normalized_per_processor": 33797.25, "ck_system": "XE2420_datacenter_T4x4_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "filesystem": "ext3/ext4", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_cooling_type": "passive", "host_memory_capacity": "384 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "4 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "4.40.10.00", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/XE2420_datacenter_T4x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_settings": "closed/DellEMC/measurements/XE2420_datacenter_T4x4_TRT/XE2420_T4x4_power_settings.adoc", "print_timestamps": 0, "problem": false, "psu_details": "2x2000W", "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 179520000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_cooling_type": "air", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_datacenter_T4x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 272000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "7dab3c4a14ba642d", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 352111163800, "90.00 percentile latency (ns)": 616989048770, "95.00 percentile latency (ns)": 650086935677, "97.00 percentile latency (ns)": 663334027333, "99.00 percentile latency (ns)": 676579936649, "99.90 percentile latency (ns)": 682543092831, "Max latency (ns)": 683205422756, "Mean latency (ns)": 8223362799, "Min duration satisfied": "Yes", "Min latency (ns)": 21849491649, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1255840.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "48 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A40", "accelerator_on-chip_memories": "", "accelerators_per_node": 10, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1255840.0, "characteristics.samples_per_second.normalized_per_core": 125584.0, "characteristics.samples_per_second.normalized_per_processor": 125584.0, "ck_system": "DSS8440_A40x10_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "3.84 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 10, "normalize_processors": 10, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/DSS8440_A40x10_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 858000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_A40x10_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1300000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "4ed6ad4388514332", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 352111163800, "90.00 percentile latency (ns)": 616989048770, "95.00 percentile latency (ns)": 650086935677, "97.00 percentile latency (ns)": 663334027333, "99.00 percentile latency (ns)": 676579936649, "99.90 percentile latency (ns)": 682543092831, "Max latency (ns)": 683205422756, "Mean latency (ns)": 8223362799, "Min duration satisfied": "Yes", "Min latency (ns)": 21849491649, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 1255840.0, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "48 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA A40", "accelerator_on-chip_memories": "", "accelerators_per_node": 10, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "boot_firmware_version": "", "characteristics.samples_per_second": 1255840.0, "characteristics.samples_per_second.normalized_per_core": 125584.0, "characteristics.samples_per_second.normalized_per_processor": 125584.0, "ck_system": "DSS8440_A40x10_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "disk_controllers": "", "disk_drives": "", "division": "closed", "filesystem": "", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "3.84 TB", "host_storage_type": "NVMe SSD", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "management_firmware_version": "", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "network_speed_mbit": "", "nics_enabled_connected": "", "nics_enabled_firmware": "", "nics_enabled_os": "", "normalize_cores": 10, "normalize_processors": 10, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/DSS8440_A40x10_TRT", "number_of_nodes": 1, "number_of_type_nics_installed": "", "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_hardware": "", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "power_management": "", "power_supply_details": "", "power_supply_quantity_and_rating_watts": "", "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 858000000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_A40x10_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 1300000.0, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "34bfa927b4d45f18", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 347261158360, "90.00 percentile latency (ns)": 616162722313, "95.00 percentile latency (ns)": 649767139142, "97.00 percentile latency (ns)": 663208322388, "99.00 percentile latency (ns)": 676650547922, "99.90 percentile latency (ns)": 682698738415, "Max latency (ns)": 683370396983, "Mean latency (ns)": 16388649619, "Min duration satisfied": "Yes", "Min latency (ns)": 12545464049, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 734009, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 3, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 734009, "characteristics.samples_per_second.normalized_per_core": 244669.66666666666, "characteristics.samples_per_second.normalized_per_processor": 244669.66666666666, "ck_system": "R740_A100-PCIe-40GBx3_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "384 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.8 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "normalize_cores": 3, "normalize_processors": 3, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R740_A100-PCIe-40GBx3_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 501600000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_A100-PCIe-40GBx3_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 760000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "29b24e4c8a4cfd5a", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 347261158360, "90.00 percentile latency (ns)": 616162722313, "95.00 percentile latency (ns)": 649767139142, "97.00 percentile latency (ns)": 663208322388, "99.00 percentile latency (ns)": 676650547922, "99.90 percentile latency (ns)": 682698738415, "Max latency (ns)": 683370396983, "Mean latency (ns)": 16388649619, "Min duration satisfied": "Yes", "Min latency (ns)": 12545464049, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 734009, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIe-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 3, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 734009, "characteristics.samples_per_second.normalized_per_core": 244669.66666666666, "characteristics.samples_per_second.normalized_per_processor": 244669.66666666666, "ck_system": "R740_A100-PCIe-40GBx3_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "384 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.8 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "normalize_cores": 3, "normalize_processors": 3, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R740_A100-PCIe-40GBx3_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 501600000, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_A100-PCIe-40GBx3_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 760000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "fb94fc094f71a9df", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 341188800214, "90.00 percentile latency (ns)": 611583575008, "95.00 percentile latency (ns)": 645381241388, "97.00 percentile latency (ns)": 658907525795, "99.00 percentile latency (ns)": 672427318554, "99.90 percentile latency (ns)": 678506754141, "Max latency (ns)": 679180949477, "Mean latency (ns)": 30252037449, "Min duration satisfied": "Yes", "Min latency (ns)": 4180252642, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 261974, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "48 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Quadro RTX 8000", "accelerator_on-chip_memories": "", "accelerators_per_node": 3, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 261974, "characteristics.samples_per_second.normalized_per_core": 87324.66666666667, "characteristics.samples_per_second.normalized_per_processor": 87324.66666666667, "ck_system": "R7525_QuadroRTX8000x3_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.8 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "normalize_cores": 3, "normalize_processors": 3, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 177928080, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 269588, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "95a87e6b84866a26", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 341188800214, "90.00 percentile latency (ns)": 611583575008, "95.00 percentile latency (ns)": 645381241388, "97.00 percentile latency (ns)": 658907525795, "99.00 percentile latency (ns)": 672427318554, "99.90 percentile latency (ns)": 678506754141, "Max latency (ns)": 679180949477, "Mean latency (ns)": 30252037449, "Min duration satisfied": "Yes", "Min latency (ns)": 4180252642, "Min queries satisfied": "Yes", "Mode": "PerformanceOnly", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 261974, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "48 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Quadro RTX 8000", "accelerator_on-chip_memories": "", "accelerators_per_node": 3, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 261974, "characteristics.samples_per_second.normalized_per_core": 87324.66666666667, "characteristics.samples_per_second.normalized_per_processor": 87324.66666666667, "ck_system": "R7525_QuadroRTX8000x3_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.3, CUDA 11.1", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.8 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 600000, "min_query_count": 1, "mlperf_version": 1.0, "normalize_cores": 3, "normalize_processors": 3, "note_code": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v1.0/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.3, CUDA 11.1, cuDNN 8.1.1, Driver 460.32.03, DALI 0.30.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": 0, "performance_issue_same_index": 0, "performance_issue_unique": 0, "performance_sample_count": 204800, "print_timestamps": 0, "problem": false, "qsl_rng_seed": 7322528924094909334, "retraining": "N", "sample_index_rng_seed": 1570999273408051088, "samples_per_query": 177928080, "schedule_rng_seed": 3507442325620259414, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 269588, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "35f647474f88a293", "use_accelerator": true, "weight_data_types": "int8", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 34665748929, "90.00 percentile latency (ns)": 62479975898, "95.00 percentile latency (ns)": 65973531648, "97.00 percentile latency (ns)": 67365661365, "99.00 percentile latency (ns)": 68756097326, "99.90 percentile latency (ns)": 69379965109, "Max latency (ns)": 69452552224, "Mean latency (ns)": 34778310304, "Min duration satisfied": "Yes", "Min latency (ns)": 495528167, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 277484, "Scenario": "offline", "accelerator_frequency": "1590MHz", "accelerator_host_interconnect": "PCI Express 3.0", "accelerator_interconnect": "PCI Express 3.0", "accelerator_interconnect_topology": "4 Accelerators per CPU", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Tesla T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 277484, "characteristics.samples_per_second.normalized_per_core": 34685.5, "characteristics.samples_per_second.normalized_per_processor": 34685.5, "ck_system": "R7525_T4x8_TRT", "ck_used": true, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "1 TB", "host_memory_configuration": "8x64GB DDR4-3200 HMAA8GR7AJR4N-XN RDIMM ECC", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "2MB+16MB+128MB", "host_processor_core_count": 120, "host_processor_frequency": "2.35GHz", "host_processor_interconnect": "Infinity Fabric", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "3 TB (5x800GB WUSTR6480ASS200 in RAID5)", "host_storage_type": "3D-TLC Solid State with 12Gbps SAS", "hw_notes": "ECC off", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_T4x8_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 19272000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_T4x8_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 292000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "4fbaa006d5c4e083", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 34665748929, "90.00 percentile latency (ns)": 62479975898, "95.00 percentile latency (ns)": 65973531648, "97.00 percentile latency (ns)": 67365661365, "99.00 percentile latency (ns)": 68756097326, "99.90 percentile latency (ns)": 69379965109, "Max latency (ns)": 69452552224, "Mean latency (ns)": 34778310304, "Min duration satisfied": "Yes", "Min latency (ns)": 495528167, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 277484, "Scenario": "offline", "accelerator_frequency": "1590MHz", "accelerator_host_interconnect": "PCI Express 3.0", "accelerator_interconnect": "PCI Express 3.0", "accelerator_interconnect_topology": "4 Accelerators per CPU", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Tesla T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 8, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 277484, "characteristics.samples_per_second.normalized_per_core": 34685.5, "characteristics.samples_per_second.normalized_per_processor": 34685.5, "ck_system": "R7525_T4x8_TRT", "ck_used": true, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "1 TB", "host_memory_configuration": "8x64GB DDR4-3200 HMAA8GR7AJR4N-XN RDIMM ECC", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "2MB+16MB+128MB", "host_processor_core_count": 120, "host_processor_frequency": "2.35GHz", "host_processor_interconnect": "Infinity Fabric", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "3 TB (5x800GB WUSTR6480ASS200 in RAID5)", "host_storage_type": "3D-TLC Solid State with 12Gbps SAS", "hw_notes": "ECC off", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 8, "normalize_processors": 8, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_T4x8_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 19272000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_T4x8_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 292000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "abce00314bd7bf58", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32866508174, "90.00 percentile latency (ns)": 58982688830, "95.00 percentile latency (ns)": 62244849496, "97.00 percentile latency (ns)": 63552752307, "99.00 percentile latency (ns)": 64853009108, "99.90 percentile latency (ns)": 65441378541, "Max latency (ns)": 65501750460, "Mean latency (ns)": 32895691756, "Min duration satisfied": "Yes", "Min latency (ns)": 338076873, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 140217, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 140217, "characteristics.samples_per_second.normalized_per_core": 35054.25, "characteristics.samples_per_second.normalized_per_processor": 35054.25, "ck_system": "XE2420_T4x4_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "384 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "2.10GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/XE2420_T4x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 9184428, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_T4x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 139158, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "18be318c569860dd", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32866508174, "90.00 percentile latency (ns)": 58982688830, "95.00 percentile latency (ns)": 62244849496, "97.00 percentile latency (ns)": 63552752307, "99.00 percentile latency (ns)": 64853009108, "99.90 percentile latency (ns)": 65441378541, "Max latency (ns)": 65501750460, "Mean latency (ns)": 32895691756, "Min duration satisfied": "Yes", "Min latency (ns)": 338076873, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 140217, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 140217, "characteristics.samples_per_second.normalized_per_core": 35054.25, "characteristics.samples_per_second.normalized_per_processor": 35054.25, "ck_system": "XE2420_T4x4_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "384 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "2.10GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/XE2420_T4x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 9184428, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/XE2420_T4x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 139158, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "eaef3d809449b920", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32933193158, "90.00 percentile latency (ns)": 58889004054, "95.00 percentile latency (ns)": 62146529596, "97.00 percentile latency (ns)": 63447388066, "99.00 percentile latency (ns)": 64759252714, "99.90 percentile latency (ns)": 65345742650, "Max latency (ns)": 65409754304, "Mean latency (ns)": 32981784777, "Min duration satisfied": "Yes", "Min latency (ns)": 795176660, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 454409, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIE-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 2, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 454409, "characteristics.samples_per_second.normalized_per_core": 227204.5, "characteristics.samples_per_second.normalized_per_processor": 227204.5, "ck_system": "R7525_A100x2_TRT", "ck_used": true, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "2.50GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.84 TB", "host_storage_type": "NVMe", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 2, "normalize_processors": 2, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_A100x2_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 29722770, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100x2_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 450345, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "9b3ae60ae364dc89", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32933193158, "90.00 percentile latency (ns)": 58889004054, "95.00 percentile latency (ns)": 62146529596, "97.00 percentile latency (ns)": 63447388066, "99.00 percentile latency (ns)": 64759252714, "99.90 percentile latency (ns)": 65345742650, "Max latency (ns)": 65409754304, "Mean latency (ns)": 32981784777, "Min duration satisfied": "Yes", "Min latency (ns)": 795176660, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 454409, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "40 GB", "accelerator_memory_configuration": "HBM2", "accelerator_model_name": "NVIDIA A100-PCIE-40GB", "accelerator_on-chip_memories": "", "accelerators_per_node": 2, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 454409, "characteristics.samples_per_second.normalized_per_core": 227204.5, "characteristics.samples_per_second.normalized_per_processor": 227204.5, "ck_system": "R7525_A100x2_TRT", "ck_used": true, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "2.50GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.84 TB", "host_storage_type": "NVMe", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 2, "normalize_processors": 2, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_A100x2_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 29722770, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_A100x2_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 450345, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "b2acbaa55bdf307e", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32121920849, "90.00 percentile latency (ns)": 56755278004, "95.00 percentile latency (ns)": 59839407988, "97.00 percentile latency (ns)": 61074563847, "99.00 percentile latency (ns)": 62308338650, "99.90 percentile latency (ns)": 62862464627, "Max latency (ns)": 62923299305, "Mean latency (ns)": 32160580672, "Min duration satisfied": "Yes", "Min latency (ns)": 1565211109, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 929411, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "48 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Quadro RTX 8000", "accelerator_on-chip_memories": "", "accelerators_per_node": 10, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 929411, "characteristics.samples_per_second.normalized_per_core": 92941.1, "characteristics.samples_per_second.normalized_per_processor": 92941.1, "ck_system": "DSS8440_QuadroRTX8000x10_TRT", "ck_used": true, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "2.10GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.84 TB", "host_storage_type": "NVMe", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 10, "normalize_processors": 10, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX8000x10_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 58481610, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX8000x10_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 886085, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "e9b9518493baadba", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32121920849, "90.00 percentile latency (ns)": 56755278004, "95.00 percentile latency (ns)": 59839407988, "97.00 percentile latency (ns)": 61074563847, "99.00 percentile latency (ns)": 62308338650, "99.90 percentile latency (ns)": 62862464627, "Max latency (ns)": 62923299305, "Mean latency (ns)": 32160580672, "Min duration satisfied": "Yes", "Min latency (ns)": 1565211109, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 929411, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "48 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Quadro RTX 8000", "accelerator_on-chip_memories": "", "accelerators_per_node": 10, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 929411, "characteristics.samples_per_second.normalized_per_core": 92941.1, "characteristics.samples_per_second.normalized_per_processor": 92941.1, "ck_system": "DSS8440_QuadroRTX8000x10_TRT", "ck_used": true, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "2.10GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.84 TB", "host_storage_type": "NVMe", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 10, "normalize_processors": 10, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_QuadroRTX8000x10_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 58481610, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_QuadroRTX8000x10_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 886085, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "593f9e46899abbc5", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32751639150, "90.00 percentile latency (ns)": 58481088041, "95.00 percentile latency (ns)": 61727304706, "97.00 percentile latency (ns)": 63017573141, "99.00 percentile latency (ns)": 64307480785, "99.90 percentile latency (ns)": 64887122090, "Max latency (ns)": 64948843078, "Mean latency (ns)": 32813253617, "Min duration satisfied": "Yes", "Min latency (ns)": 973052744, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 540127, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 16, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 540127, "characteristics.samples_per_second.normalized_per_core": 33757.9375, "characteristics.samples_per_second.normalized_per_processor": 33757.9375, "ck_system": "DSS8440_T4x16_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "384 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.84 TB", "host_storage_type": "NVMe", "hw_notes": "ECC off", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 16, "normalize_processors": 16, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x16_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 35080650, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x16_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 531525, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "48e3cead7d84f97b", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32751639150, "90.00 percentile latency (ns)": 58481088041, "95.00 percentile latency (ns)": 61727304706, "97.00 percentile latency (ns)": 63017573141, "99.00 percentile latency (ns)": 64307480785, "99.90 percentile latency (ns)": 64887122090, "Max latency (ns)": 64948843078, "Mean latency (ns)": 32813253617, "Min duration satisfied": "Yes", "Min latency (ns)": 973052744, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 540127, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 16, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 540127, "characteristics.samples_per_second.normalized_per_core": 33757.9375, "characteristics.samples_per_second.normalized_per_processor": 33757.9375, "ck_system": "DSS8440_T4x16_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "384 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.84 TB", "host_storage_type": "NVMe", "hw_notes": "ECC off", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 16, "normalize_processors": 16, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x16_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 35080650, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x16_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 531525, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "34e90f43c2950b74", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 69711170333, "90.00 percentile latency (ns)": 127573962996, "95.00 percentile latency (ns)": 134878214013, "97.00 percentile latency (ns)": 137794816013, "99.00 percentile latency (ns)": 140702938304, "99.90 percentile latency (ns)": 142003839888, "Max latency (ns)": 142152919965, "Mean latency (ns)": 70259221127, "Min duration satisfied": "Yes", "Min latency (ns)": 479162646, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 126287, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 126287, "characteristics.samples_per_second.normalized_per_core": 31571.75, "characteristics.samples_per_second.normalized_per_processor": 31571.75, "ck_system": "R7515_T4x4_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.0.14, CUDA 11.0.207", "host_memory_capacity": "256 GB", "host_memory_configuration": "DDR-4", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "2.0GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 1, "host_storage_capacity": "3.2 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7515_T4x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.27, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 17952000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7515_T4x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 272000, "task": "recommendation", "task2": "recommendation", "total_cores": 120, "uid": "013b15cfea75f5b8", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 69711170333, "90.00 percentile latency (ns)": 127573962996, "95.00 percentile latency (ns)": 134878214013, "97.00 percentile latency (ns)": 137794816013, "99.00 percentile latency (ns)": 140702938304, "99.90 percentile latency (ns)": 142003839888, "Max latency (ns)": 142152919965, "Mean latency (ns)": 70259221127, "Min duration satisfied": "Yes", "Min latency (ns)": 479162646, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 126287, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 126287, "characteristics.samples_per_second.normalized_per_core": 31571.75, "characteristics.samples_per_second.normalized_per_processor": 31571.75, "ck_system": "R7515_T4x4_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.0.14, CUDA 11.0.207", "host_memory_capacity": "256 GB", "host_memory_configuration": "DDR-4", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "2.0GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 1, "host_storage_capacity": "3.2 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7515_T4x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.27, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 17952000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7515_T4x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 272000, "task": "recommendation", "task2": "recommendation", "total_cores": 120, "uid": "f56c49a9912d0818", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 33215531305, "90.00 percentile latency (ns)": 59487499786, "95.00 percentile latency (ns)": 62791935455, "97.00 percentile latency (ns)": 64110694521, "99.00 percentile latency (ns)": 65438048802, "99.90 percentile latency (ns)": 66032655720, "Max latency (ns)": 66099488678, "Mean latency (ns)": 33292827619, "Min duration satisfied": "Yes", "Min latency (ns)": 759972368, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 399298, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 12, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 399298, "characteristics.samples_per_second.normalized_per_core": 33274.833333333336, "characteristics.samples_per_second.normalized_per_processor": 33274.833333333336, "ck_system": "DSS8440_T4x12_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 12, "normalize_processors": 12, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x12_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 26393400, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x12_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 399900, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "ea9651346ae85db4", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 33215531305, "90.00 percentile latency (ns)": 59487499786, "95.00 percentile latency (ns)": 62791935455, "97.00 percentile latency (ns)": 64110694521, "99.00 percentile latency (ns)": 65438048802, "99.90 percentile latency (ns)": 66032655720, "Max latency (ns)": 66099488678, "Mean latency (ns)": 33292827619, "Min duration satisfied": "Yes", "Min latency (ns)": 759972368, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 399298, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 12, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 399298, "characteristics.samples_per_second.normalized_per_core": 33274.833333333336, "characteristics.samples_per_second.normalized_per_processor": 33274.833333333336, "ck_system": "DSS8440_T4x12_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0 Update 1", "host_memory_capacity": "768 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1 TB", "host_storage_type": "NVMe SSD", "hw_notes": "ECC off", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 12, "normalize_processors": 12, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/DSS8440_T4x12_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2, CUDA 11.0 Update 1, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 26393400, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/DSS8440_T4x12_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 399900, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "7b0f634dedad464c", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 43151053037, "90.00 percentile latency (ns)": 77001533385, "95.00 percentile latency (ns)": 81229204115, "97.00 percentile latency (ns)": 82924017198, "99.00 percentile latency (ns)": 84612884827, "99.90 percentile latency (ns)": 85376656023, "Max latency (ns)": 85455622634, "Mean latency (ns)": 43156925384, "Min duration satisfied": "Yes", "Min latency (ns)": 851561488, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 322834, "Scenario": "offline", "accelerator_frequency": "1770MHz", "accelerator_host_interconnect": "PCI Express 3.0", "accelerator_interconnect": "PCI Express 3.0", "accelerator_interconnect_topology": "2 Accelerators per CPU", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Quadro RTX 6000", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 322834, "characteristics.samples_per_second.normalized_per_core": 80708.5, "characteristics.samples_per_second.normalized_per_processor": 80708.5, "ck_system": "C4140_QuadroRTX6000x4_TRT", "ck_used": true, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "384 GB", "host_memory_configuration": "6x16GB DDR4-2666 HMA82GR7AFR8N-VK RDIMM ECC", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "1.25MB+20MB+27.5MB", "host_processor_core_count": 120, "host_processor_frequency": "2.40GHz", "host_processor_interconnect": "Ultra Path Interconnect", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.6 TB (1x1.6TB Dell Express Flash PM1725a 1.6TB AIC)", "host_storage_type": "3D-TLC Solid State with PCIe NVME x8 Interface", "hw_notes": "ECC off. RTX6000 is available as a special config thru Dell DSS or OEM for PowerEdge C4140", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/C4140_QuadroRTX6000x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 27588000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/C4140_QuadroRTX6000x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 418000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "efa45a898ff9045d", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 43151053037, "90.00 percentile latency (ns)": 77001533385, "95.00 percentile latency (ns)": 81229204115, "97.00 percentile latency (ns)": 82924017198, "99.00 percentile latency (ns)": 84612884827, "99.90 percentile latency (ns)": 85376656023, "Max latency (ns)": 85455622634, "Mean latency (ns)": 43156925384, "Min duration satisfied": "Yes", "Min latency (ns)": 851561488, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 322834, "Scenario": "offline", "accelerator_frequency": "1770MHz", "accelerator_host_interconnect": "PCI Express 3.0", "accelerator_interconnect": "PCI Express 3.0", "accelerator_interconnect_topology": "2 Accelerators per CPU", "accelerator_memory_capacity": "24 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Quadro RTX 6000", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 322834, "characteristics.samples_per_second.normalized_per_core": 80708.5, "characteristics.samples_per_second.normalized_per_processor": 80708.5, "ck_system": "C4140_QuadroRTX6000x4_TRT", "ck_used": true, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "384 GB", "host_memory_configuration": "6x16GB DDR4-2666 HMA82GR7AFR8N-VK RDIMM ECC", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "1.25MB+20MB+27.5MB", "host_processor_core_count": 120, "host_processor_frequency": "2.40GHz", "host_processor_interconnect": "Ultra Path Interconnect", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "1.6 TB (1x1.6TB Dell Express Flash PM1725a 1.6TB AIC)", "host_storage_type": "3D-TLC Solid State with PCIe NVME x8 Interface", "hw_notes": "ECC off. RTX6000 is available as a special config thru Dell DSS or OEM for PowerEdge C4140", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/C4140_QuadroRTX6000x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 27588000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/C4140_QuadroRTX6000x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 418000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "5ae6852a44098b14", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32978793909, "90.00 percentile latency (ns)": 59223170368, "95.00 percentile latency (ns)": 62512750016, "97.00 percentile latency (ns)": 63828861951, "99.00 percentile latency (ns)": 65141605595, "99.90 percentile latency (ns)": 65736690439, "Max latency (ns)": 65797411732, "Mean latency (ns)": 33029154305, "Min duration satisfied": "Yes", "Min latency (ns)": 447340568, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 270418, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "48 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Quadro RTX 8000", "accelerator_on-chip_memories": "", "accelerators_per_node": 3, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 270418, "characteristics.samples_per_second.normalized_per_core": 90139.33333333333, "characteristics.samples_per_second.normalized_per_processor": 90139.33333333333, "ck_system": "R7525_QuadroRTX8000x3_TRT", "ck_used": false, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 32, "host_processor_frequency": "2.50GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7502", "host_processors_per_node": 2, "host_storage_capacity": "1.84 TB", "host_storage_type": "NVMe", "hw_notes": "", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 3, "normalize_processors": 3, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT", "number_of_nodes": 1, "operating_system": "CentOS Linux release 8.1.1911", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 17792808, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT", "system_name": "Dell EMC PowerEdge R7525 (3x Quadro RTX 8000)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 269588, "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "36ed7db8a24f1193", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 32978793909, "90.00 percentile latency (ns)": 59223170368, "95.00 percentile latency (ns)": 62512750016, "97.00 percentile latency (ns)": 63828861951, "99.00 percentile latency (ns)": 65141605595, "99.90 percentile latency (ns)": 65736690439, "Max latency (ns)": 65797411732, "Mean latency (ns)": 33029154305, "Min duration satisfied": "Yes", "Min latency (ns)": 447340568, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 270418, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "48 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA Quadro RTX 8000", "accelerator_on-chip_memories": "", "accelerators_per_node": 3, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 270418, "characteristics.samples_per_second.normalized_per_core": 90139.33333333333, "characteristics.samples_per_second.normalized_per_processor": 90139.33333333333, "ck_system": "R7525_QuadroRTX8000x3_TRT", "ck_used": false, "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2, CUDA 11.0, cuDNN 8.0.2, cuBLAS 11.2.0, libjemalloc2, cub 1.8.0, tensorrt-laboratory mlperf branch", "host_memory_capacity": "512 GB", "host_memory_configuration": "", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 32, "host_processor_frequency": "2.50GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7502", "host_processors_per_node": 2, "host_storage_capacity": "1.84 TB", "host_storage_type": "NVMe", "hw_notes": "", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 3, "normalize_processors": 3, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R7525_QuadroRTX8000x3_TRT", "number_of_nodes": 1, "operating_system": "CentOS Linux release 8.1.1911", "other_software_stack": "docker 19.03.12, python 3.6.8, gcc 5.5.0, onnx 1.3.0, tensorflow 1.13.1, pytorch 1.1.0, torchvision 0.3.0, pycuda 2019.1, sacrebleu 1.3.3, SimpleJSON, OpenCV 4.1.1", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 17792808, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R7525_QuadroRTX8000x3_TRT", "system_name": "Dell EMC PowerEdge R7525 (3x Quadro RTX 8000)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 269588, "task": "recommendation", "task2": "recommendation", "total_cores": 64, "uid": "6d4498c9e82a9906", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 67692230543, "90.00 percentile latency (ns)": 122581521928, "95.00 percentile latency (ns)": 129513953277, "97.00 percentile latency (ns)": 132277257334, "99.00 percentile latency (ns)": 135054160515, "99.90 percentile latency (ns)": 136305904070, "Max latency (ns)": 136443109942, "Mean latency (ns)": 67943016621, "Min duration satisfied": "Yes", "Min latency (ns)": 583746856, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 131571, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 131571, "characteristics.samples_per_second.normalized_per_core": 32892.75, "characteristics.samples_per_second.normalized_per_processor": 32892.75, "ck_system": "R740_T4x4_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.9, "formal_model_link": "", "framework": "TensorRT 7.2.0.14, CUDA 11.0.207", "host_memory_capacity": "384 GB", "host_memory_configuration": "DDR-4", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "3.0GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "3.84 TB", "host_storage_type": "SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99.9", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R740_T4x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.207, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 17952000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_T4x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 272000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "dfe4b89ade2cbf4c", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" }, { "50.00 percentile latency (ns)": 67692230543, "90.00 percentile latency (ns)": 122581521928, "95.00 percentile latency (ns)": 129513953277, "97.00 percentile latency (ns)": 132277257334, "99.00 percentile latency (ns)": 135054160515, "99.90 percentile latency (ns)": 136305904070, "Max latency (ns)": 136443109942, "Mean latency (ns)": 67943016621, "Min duration satisfied": "Yes", "Min latency (ns)": 583746856, "Min queries satisfied": "Yes", "Mode": "Performance", "Result is": "VALID", "SUT name": "DLRM SERVER", "Samples per second": 131571, "Scenario": "offline", "accelerator_frequency": "", "accelerator_host_interconnect": "", "accelerator_interconnect": "", "accelerator_interconnect_topology": "", "accelerator_memory_capacity": "16 GB", "accelerator_memory_configuration": "GDDR6", "accelerator_model_name": "NVIDIA T4", "accelerator_on-chip_memories": "", "accelerators_per_node": 4, "accuracy_log_probability": 0, "accuracy_log_rng_seed": 0, "accuracy_log_sampling_target": 0, "characteristics.samples_per_second": 131571, "characteristics.samples_per_second.normalized_per_core": 32892.75, "characteristics.samples_per_second.normalized_per_processor": 32892.75, "ck_system": "R740_T4x4_TRT", "ck_used": true, "cooling": "", "dataset": "1TB Click Logs", "dataset_link": "", "dim_x_default": "characteristics.samples_per_second", "dim_x_maximize": true, "dim_y_default": "characteristics.AUC", "dim_y_maximize": true, "division": "closed", "formal_model": "dlrm", "formal_model_accuracy": 99.0, "formal_model_link": "", "framework": "TensorRT 7.2.0.14, CUDA 11.0.207", "host_memory_capacity": "384 GB", "host_memory_configuration": "DDR-4", "host_networking": "", "host_networking_topology": "", "host_processor_caches": "", "host_processor_core_count": 120, "host_processor_frequency": "3.0GHz", "host_processor_interconnect": "", "host_processor_model_name": "AMD EPYC 7V13 64-Core Processor", "host_processors_per_node": 2, "host_storage_capacity": "3.84 TB", "host_storage_type": "SSD", "hw_notes": "ECC on", "informal_model": "dlrm-99", "input_data_types": "int8", "max_async_queries": 1, "max_duration (ms)": 0, "max_query_count": 0, "min_duration (ms)": 60000, "min_query_count": 1, "mlperf_version": 0.7, "normalize_cores": 4, "normalize_processors": 4, "note_code": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/code", "note_details": "https://github.com/mlcommons/inference_results_v0.7/tree/master/closed/DellEMC/results/R740_T4x4_TRT", "number_of_nodes": 1, "operating_system": "Ubuntu 18.04.5 LTS (Linux-5.4.0-1055-azure-x86_64-with-Ubuntu-18.04-bionic)", "other_software_stack": "TensorRT 7.2.0.14, CUDA 11.0.207, cuDNN 8.0.2, DALI 0.25.0; GCC 7.5.0; Python 3.7.10", "performance_issue_same": true, "performance_issue_same_index": 0, "performance_issue_unique": true, "performance_sample_count": 204800, "print_timestamps": true, "problem": false, "qsl_rng_seed": 12786827339337101903, "retraining": "N", "sample_index_rng_seed": 12640797754436136668, "samples_per_query": 17952000, "schedule_rng_seed": 3135815929913719677, "starting_weights_filename": "tb00_40M.pt", "status": "available", "submitter": "DellEMC", "submitter_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.submitter/DellEMC", "sw_notes": "Powered by CK v2.5.8 (https://github.com/ctuning/ck)", "system_link": "https://github.com/ctuning/ck-mlperf-inference/tree/main/bench.mlperf.system/R740_T4x4_TRT", "system_name": "Microsoft Corporation 7.0 (Virtual Machine)", "system_type": "datacenter", "target_latency (ns)": 0, "target_qps": 272000, "task": "recommendation", "task2": "recommendation", "total_cores": 240, "uid": "d584b4c2df4e09ae", "use_accelerator": true, "weight_data_types": "int8,fp16", "weight_transformations": "quantization, affine fusion" } ]