diff --git a/benchmarks/config/omp/mlir-bf16.json b/benchmarks/config/omp/mlir-bf16.json index 4623eb29d..79e167c56 100644 --- a/benchmarks/config/omp/mlir-bf16.json +++ b/benchmarks/config/omp/mlir-bf16.json @@ -5,28 +5,28 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-width=16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16'" ], "extensions": [ "(avx2|asimd)" ] }, "bf16_dp2_3x1024_omp_4_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-width=16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8'" ], "extensions": [ "(avx2|asimd)" ] }, "bf16_dp2_3x1024_omp_8_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-width=16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8'" ], "extensions": [ "(avx2|asimd)" ] }, "bf16_dp2_3x1024_omp_16_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-width=16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8'" ], "extensions": [ "(avx2|asimd)" ] } }}, @@ -36,28 +36,28 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-width=16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16'" ], "extensions": [ "(avx2|asimd)" ] }, "bf16_dp2_3x1024_omp_4_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-width=16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8'" ], "extensions": [ "(avx2|asimd)" ] }, "bf16_dp2_3x1024_omp_8_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-width=16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8'" ], "extensions": [ "(avx2|asimd)" ] }, "bf16_dp2_3x1024_omp_16_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-width=16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8'" ], "extensions": [ "(avx2|asimd)" ] } }} diff --git a/benchmarks/config/omp/mlir-fp32.json b/benchmarks/config/omp/mlir-fp32.json index 894273e45..1ad34561f 100644 --- a/benchmarks/config/omp/mlir-fp32.json +++ b/benchmarks/config/omp/mlir-fp32.json @@ -5,28 +5,28 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-width=32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_4_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-width=32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_8_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-width=32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_16_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-width=32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8'" ], "extensions": [ "(avx2|asimd)" ] } }}, @@ -36,28 +36,28 @@ "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-width=32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_4_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-width=32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_8_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-width=32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8'" ], "extensions": [ "(avx2|asimd)" ] }, "fp32_3x1024_omp_16_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-width=32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8'" ], "extensions": [ "(avx2|asimd)" ] } }} diff --git a/benchmarks/config/omp/torch-dynamo.json b/benchmarks/config/omp/torch-dynamo.json index de384f908..4509c9ef5 100644 --- a/benchmarks/config/omp/torch-dynamo.json +++ b/benchmarks/config/omp/torch-dynamo.json @@ -5,28 +5,28 @@ "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16'" ], "extensions": [ ] }, "fp32_3x1024_omp_4_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8'" ], "extensions": [ ] }, "fp32_3x1024_omp_8_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8'" ], "extensions": [ ] }, "fp32_3x1024_omp_16_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8'" ], "extensions": [ ] } }}, @@ -36,28 +36,28 @@ "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-bf16-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16'" ], "extensions": [ ] }, "bf16_3x1024_omp_4_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-bf16-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8'" ], "extensions": [ ] }, "bf16_3x1024_omp_8_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-bf16-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8'" ], "extensions": [ ] }, "bf16_3x1024_omp_16_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-gemm-bf16-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8'" ], "extensions": [ ] } }}, @@ -67,28 +67,28 @@ "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16'" ], "extensions": [ ] }, "fp32_3x1024_omp_4_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8'" ], "extensions": [ ] }, "fp32_3x1024_omp_8_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8'" ], "extensions": [ ] }, "fp32_3x1024_omp_16_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8'" ], "extensions": [ ] } }}, @@ -98,28 +98,28 @@ "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-bf16-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16'" ], "extensions": [ ] }, "bf16_3x1024_omp_4_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-bf16-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8'" ], "extensions": [ ] }, "bf16_3x1024_omp_8_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-bf16-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8'" ], "extensions": [ ] }, "bf16_3x1024_omp_16_mlir": { "type": "MLIR", "benchmark": "pytorch/torch-dynamo-mlp-bf16-3x1024.mlir", "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, - "flags": [ "-n", "100", "-run-args='-def-parallel'" ], + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8'" ], "extensions": [ ] } }}