From 2c72dca725a99ff787009b8f8c85a0a38f11eb75 Mon Sep 17 00:00:00 2001 From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:19:25 +0000 Subject: [PATCH] Add tests --- .../tests/backend/vectorizer/dune.inc | 532 ++++++++++++++++++ .../tests/backend/vectorizer/gen/gen_dune.ml | 7 + .../backend/vectorizer/test_arrays.expected | 9 + .../tests/backend/vectorizer/test_arrays.ml | 141 +++++ .../tests/backend/vectorizer/test_arrays.mli | 1 + .../test_arrays_vectorized.cmx.dump.expected | 1 + .../backend/vectorizer/test_float.expected | 7 + .../tests/backend/vectorizer/test_float.ml | 75 +++ .../tests/backend/vectorizer/test_float.mli | 1 + .../vectorizer/test_float32_unboxed.expected | 5 + .../vectorizer/test_float32_unboxed.ml | 225 ++++++++ .../vectorizer/test_float32_unboxed.mli | 1 + ...oat32_unboxed_vectorized.cmx.dump.expected | 1 + .../vectorizer/test_float_unboxed.expected | 2 + .../backend/vectorizer/test_float_unboxed.ml | 80 +++ .../backend/vectorizer/test_float_unboxed.mli | 1 + ...float_unboxed_vectorized.cmx.dump.expected | 2 + .../test_float_vectorized.cmx.dump.expected | 7 + .../vectorizer/test_int32_unboxed.expected | 7 + .../backend/vectorizer/test_int32_unboxed.ml | 229 ++++++++ .../backend/vectorizer/test_int32_unboxed.mli | 1 + ...int32_unboxed_vectorized.cmx.dump.expected | 3 + .../backend/vectorizer/test_int64.expected | 7 + .../tests/backend/vectorizer/test_int64.ml | 79 +++ .../tests/backend/vectorizer/test_int64.mli | 1 + .../vectorizer/test_int64_unboxed.expected | 3 + .../backend/vectorizer/test_int64_unboxed.ml | 61 ++ .../backend/vectorizer/test_int64_unboxed.mli | 1 + ...int64_unboxed_vectorized.cmx.dump.expected | 3 + .../test_int64_vectorized.cmx.dump.expected | 3 + 30 files changed, 1496 insertions(+) create mode 100644 flambda-backend/tests/backend/vectorizer/test_arrays.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_arrays.ml create mode 100644 
flambda-backend/tests/backend/vectorizer/test_arrays.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_float.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml create mode 100644 
flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected diff --git a/flambda-backend/tests/backend/vectorizer/dune.inc b/flambda-backend/tests/backend/vectorizer/dune.inc index 67a51f80bb5..cb368edfd8b 100644 --- a/flambda-backend/tests/backend/vectorizer/dune.inc +++ b/flambda-backend/tests/backend/vectorizer/dune.inc @@ -75,6 +75,538 @@ (action (diff test1_vectorized.expected test1_vectorized.output))) +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_arrays_runner.exe test_arrays.cmx.dump) + (deps test_arrays.mli test_arrays.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_arrays_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_arrays.output + (run ./test_arrays_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_arrays.expected test_arrays.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.ml test_arrays_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.mli test_arrays_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_arrays_vectorized_runner.exe test_arrays_vectorized.cmx.dump) + (deps test_arrays_vectorized.mli test_arrays_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_arrays_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} 
"amd64")) ) + (target test_arrays_vectorized.cmx.dump.output) + (deps ./filter.sh test_arrays_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_arrays_vectorized.cmx.dump.expected test_arrays_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_arrays_vectorized.output + (run ./test_arrays_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.expected test_arrays_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_arrays_vectorized.expected test_arrays_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_unboxed_runner.exe test_int64_unboxed.cmx.dump) + (deps test_int64_unboxed.mli test_int64_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_int64_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_unboxed.output + (run ./test_int64_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64_unboxed.expected test_int64_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.ml test_int64_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.mli test_int64_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_unboxed_vectorized_runner.exe 
test_int64_unboxed_vectorized.cmx.dump) + (deps test_int64_unboxed_vectorized.mli test_int64_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_int64_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int64_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_int64_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int64_unboxed_vectorized.cmx.dump.expected test_int64_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_unboxed_vectorized.output + (run ./test_int64_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.expected test_int64_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64_unboxed_vectorized.expected test_int64_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_unboxed_runner.exe test_float_unboxed.cmx.dump) + (deps test_float_unboxed.mli test_float_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_float_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_unboxed.output + (run ./test_float_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff 
test_float_unboxed.expected test_float_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.ml test_float_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.mli test_float_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_unboxed_vectorized_runner.exe test_float_unboxed_vectorized.cmx.dump) + (deps test_float_unboxed_vectorized.mli test_float_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_float_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_float_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_float_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float_unboxed_vectorized.cmx.dump.expected test_float_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_unboxed_vectorized.output + (run ./test_float_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.expected test_float_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float_unboxed_vectorized.expected test_float_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_runner.exe test_int64.cmx.dump) + (deps test_int64.mli test_int64.ml) + (action (run 
%{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_int64_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64.output + (run ./test_int64_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64.expected test_int64.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.ml test_int64_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.mli test_int64_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_vectorized_runner.exe test_int64_vectorized.cmx.dump) + (deps test_int64_vectorized.mli test_int64_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_int64_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int64_vectorized.cmx.dump.output) + (deps ./filter.sh test_int64_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int64_vectorized.cmx.dump.expected test_int64_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_vectorized.output + (run ./test_int64_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.expected test_int64_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff 
test_int64_vectorized.expected test_int64_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_runner.exe test_float.cmx.dump) + (deps test_float.mli test_float.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_float_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float.output + (run ./test_float_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float.expected test_float.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.ml test_float_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.mli test_float_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_vectorized_runner.exe test_float_vectorized.cmx.dump) + (deps test_float_vectorized.mli test_float_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_float_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_float_vectorized.cmx.dump.output) + (deps ./filter.sh test_float_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float_vectorized.cmx.dump.expected test_float_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_vectorized.output + (run ./test_float_vectorized_runner.exe)))) + 
+(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.expected test_float_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float_vectorized.expected test_float_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float32_unboxed_runner.exe test_float32_unboxed.cmx.dump) + (deps test_float32_unboxed.mli test_float32_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_float32_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float32_unboxed.output + (run ./test_float32_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float32_unboxed.expected test_float32_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float32_unboxed.ml test_float32_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float32_unboxed.mli test_float32_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float32_unboxed_vectorized_runner.exe test_float32_unboxed_vectorized.cmx.dump) + (deps test_float32_unboxed_vectorized.mli test_float32_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_float32_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_float32_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_float32_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + 
(with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float32_unboxed_vectorized.cmx.dump.expected test_float32_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float32_unboxed_vectorized.output + (run ./test_float32_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float32_unboxed.expected test_float32_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float32_unboxed_vectorized.expected test_float32_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int32_unboxed_runner.exe test_int32_unboxed.cmx.dump) + (deps test_int32_unboxed.mli test_int32_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_int32_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int32_unboxed.output + (run ./test_int32_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int32_unboxed.expected test_int32_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int32_unboxed.ml test_int32_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int32_unboxed.mli test_int32_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int32_unboxed_vectorized_runner.exe test_int32_unboxed_vectorized.cmx.dump) + (deps test_int32_unboxed_vectorized.mli 
test_int32_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_int32_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int32_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_int32_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int32_unboxed_vectorized.cmx.dump.expected test_int32_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int32_unboxed_vectorized.output + (run ./test_int32_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int32_unboxed.expected test_int32_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int32_unboxed_vectorized.expected test_int32_unboxed_vectorized.output))) + (rule (alias runtest) (enabled_if (= %{context_name} "main")) diff --git a/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml b/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml index 53062d52b9f..4a2e903dbe3 100644 --- a/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml +++ b/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml @@ -170,6 +170,13 @@ let print_test ?(filter_exit_code = 0) name = let () = print_test "test1"; + print_test "test_arrays"; + print_test "test_int64_unboxed"; + print_test "test_float_unboxed"; + print_test "test_int64"; + print_test "test_float"; + print_test "test_float32_unboxed"; + print_test "test_int32_unboxed"; (* can't vectorize *) print_test ~filter_exit_code:1 
"test_register_compatible"; () diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.expected b/flambda-backend/tests/backend/vectorizer/test_arrays.expected new file mode 100644 index 00000000000..e86cd1806ce --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.expected @@ -0,0 +1,9 @@ +add_arrays_unrolled_manually 17 18 19 20 21 22 23 24 25 26 +add_arrays_unrolled_safe 17 18 19 20 21 22 23 24 25 26 +add_arrays_rec_unrolled_attribute 17 18 19 20 21 22 23 24 25 26 +add_arrays_for 17 18 19 20 21 22 23 24 25 26 +add_arrays_rec 17 18 19 20 21 22 23 24 25 26 +initialize_array_const_unrolled_manually 0 0 0 0 0 0 0 0 0 0 +initialize_arrays_const_unrolled_manually 0 0 0 0 0 0 0 0 0 0 +initialize_array_unrolled_manually 17 17 17 17 17 17 17 17 17 17 +initialize_floatarray_unrolled_manually 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.ml b/flambda-backend/tests/backend/vectorizer/test_arrays.ml new file mode 100644 index 00000000000..fab21fe1453 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.ml @@ -0,0 +1,141 @@ +let[@inline never] [@local never] [@specialize never] add_arrays_unrolled_manually + a b c n = + for i = 0 to (n / 2) - 1 do + Array.unsafe_set c (i * 2) + (Array.unsafe_get a (i * 2) + Array.unsafe_get b (i * 2)); + Array.unsafe_set c + ((i * 2) + 1) + (Array.unsafe_get a ((i * 2) + 1) + Array.unsafe_get b ((i * 2) + 1)) + done; + if Int.rem n 2 = 1 + then + Array.unsafe_set c (n - 1) + (Array.unsafe_get a (n - 1) + Array.unsafe_get b (n - 1)) + +(* Currently won't be vectorized. Can vectorize it but it's not worth it + according to our cost model. It will be vectorized when we add vectors beyond + 128 or arrays of elements smaller than 64-bit. 
*) +let[@inline never] [@local never] [@specialize never] initialize_array_const_unrolled_manually + arr n = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i 0; + Array.unsafe_set arr (!i + 1) 0; + i := !i + 2 + done + +(* Currently, won't be vectorized. If different groups can reuse the new + register that holds the constants, this will be worth vectorizing even with + 128-bit vectors. *) +let[@inline never] [@local never] [@specialize never] initialize_arrays_const_unrolled_manually + a b c n = + let i = ref 0 in + while !i < n do + Array.unsafe_set a !i 0; + Array.unsafe_set a (!i + 1) 0; + Array.unsafe_set b !i 0; + Array.unsafe_set b (!i + 1) 0; + Array.unsafe_set c !i 0; + Array.unsafe_set c (!i + 1) 0; + i := !i + 2 + done + +(* Currently, won't be vectorized. Shuffling values into a vector is not yet + supported, only vector loads are. Also not worth it unless the shuffle is + outside the loop (loop invariant detection/motion would be needed for it). *) +let[@inline never] [@local never] [@specialize never] initialize_array_unrolled_manually + arr n (v : int) = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i v; + Array.unsafe_set arr (!i + 1) v; + i := !i + 2 + done + +(* same as [initialize_array_unrolled_manually] except needs movddup. *) +let[@inline never] [@local never] [@specialize never] initialize_floatarray_unrolled_manually + arr n (v : float) = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i v; + Array.unsafe_set arr (!i + 1) v; + i := !i + 2 + done + +(* cannot vectorize across basic blocks *) +let[@inline never] [@local never] [@specialize never] add_arrays_unrolled_safe a + b c n = + for i = 0 to n - 1 do + Array.set c (i * 2) (Array.get a (i * 2) + Array.get b (i * 2)); + Array.set c + ((i * 2) + 1) + (Array.get a ((i * 2) + 1) + Array.get b ((i * 2) + 1)) + done + +(* cannot vectorize across basic blocks. 
unroll attribute is not sufficient to + eliminate the loop condition from the unrolled body (e.g., we would need to + track the fact that the bound is even). *) +let[@inline never] [@local never] [@specialize never] add_arrays_rec_unrolled_attribute + a b c n = + let[@loop never] rec loop i a b c n = + if i < n + then ( + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i); + (loop [@unrolled 1]) (i + 1) a b c n) + in + loop 0 a b c (2 * n) + +(* cannot vectorize for-loops *) +let[@inline never] [@local never] [@specialize never] add_arrays_for a b c n = + for i = 0 to n - 1 do + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i) + done + +(* cannot vectorize loops expressed using recursion *) +let[@inline never] [@local never] [@specialize never] add_arrays_rec a b c n = + let rec loop i = + if i < n + then ( + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i); + loop (i + 1)) + in + loop 0 + +let print_array ppf a = + let count = Array.length a in + for i = 0 to count - 1 do + Format.fprintf ppf "%d " a.(i) + done + +let print_floatarray ppf a = + let count = Array.length a in + for i = 0 to count - 1 do + Format.fprintf ppf "%f " a.(i) + done + +let () = + let n = Sys.opaque_identity 10 in + let a = Array.init n (fun i -> i) in + let b = Array.make n 17 in + let c = Array.make n 0 in + let d = Array.make n 0.0 in + add_arrays_unrolled_manually a b c (Sys.opaque_identity n); + Format.printf "add_arrays_unrolled_manually %a\n" print_array c; + add_arrays_unrolled_safe a b c (Sys.opaque_identity (n / 2)); + Format.printf "add_arrays_unrolled_safe %a\n" print_array c; + add_arrays_rec_unrolled_attribute a b c (n / 2); + Format.printf "add_arrays_rec_unrolled_attribute %a\n" print_array c; + add_arrays_for a b c n; + Format.printf "add_arrays_for %a\n" print_array c; + add_arrays_rec a b c n; + Format.printf "add_arrays_rec %a\n" print_array c; + initialize_array_const_unrolled_manually c n; + Format.printf 
"initialize_array_const_unrolled_manually %a\n" print_array c; + initialize_arrays_const_unrolled_manually a b c n; + Format.printf "initialize_arrays_const_unrolled_manually %a\n" print_array c; + initialize_array_unrolled_manually c n (Sys.opaque_identity 17); + Format.printf "initialize_array_unrolled_manually %a\n" print_array c; + initialize_floatarray_unrolled_manually d n (Sys.opaque_identity 7.7); + Format.printf "initialize_floatarray_unrolled_manually %a\n" print_floatarray + d; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.mli b/flambda-backend/tests/backend/vectorizer/test_arrays.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..182c1cc7309 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected @@ -0,0 +1 @@ +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 7 vector instructions, cost = -1 (Test_arrays_vectorized.add_arrays_unrolled_manually) diff --git a/flambda-backend/tests/backend/vectorizer/test_float.expected b/flambda-backend/tests/backend/vectorizer/test_float.expected new file mode 100644 index 00000000000..00ffe66d5e1 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.expected @@ -0,0 +1,7 @@ +add_mutable_record { d0 = 88.000000 ; d1 = 110.000000 } +copy_mutable_record { d0 = 88.000000 ; d1 = 110.000000 } +add_mutable_record_fresh { d0 = 88.000000 ; d1 = 110.000000 } +copy_mutable_record_fresh { d0 = 88.000000 ; d1 = 110.000000 } +add_mutable_record_t4 { d0 = 88.000000 ; d1 = 110.000000; d2 = 88.000000 ; d3 = 110.000000 } +copy_mutable_record_t4 { d0 = 
8.000000 ; d1 = 96.000000; d2 = 80.000000 ; d3 = 14.000000 } +dup_mutable_record_t4 { d0 = 8.000000 ; d1 = 96.000000; d2 = 8.000000 ; d3 = 96.000000 } diff --git a/flambda-backend/tests/backend/vectorizer/test_float.ml b/flambda-backend/tests/backend/vectorizer/test_float.ml new file mode 100644 index 00000000000..1e36c686ceb --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.ml @@ -0,0 +1,75 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +type t1 = + { mutable d0 : float; + mutable d1 : float + } + +let[@inline never] [@local never] [@specialize never] add_mutable_record + (a : t1) (b : t1) (c : t1) : t1 = + c.d0 <- Float.add a.d0 b.d0; + c.d1 <- Float.add a.d1 b.d1; + c + +let[@inline never] [@local never] [@specialize never] copy_mutable_record + (a : t1) (b : t1) : t1 = + b.d0 <- a.d0; + b.d1 <- a.d1; + b + +let[@inline never] [@local never] [@specialize never] add_mutable_record_fresh + (a : t1) (b : t1) : t1 = + { d0 = Float.add a.d0 b.d0; d1 = Float.add a.d1 b.d1 } + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_fresh + (a : t1) : t1 = + { d0 = a.d0; d1 = a.d1 } + +type t4 = + { mutable d0 : float; + mutable d1 : float; + mutable d2 : float; + mutable d3 : float + } + +let[@inline never] [@local never] [@specialize never] add_mutable_record_t4 + (a : t1) (b : t1) (c : t4) : t4 = + c.d0 <- Float.add a.d0 b.d0; + c.d1 <- Float.add a.d1 b.d1; + c.d2 <- Float.add a.d0 b.d0; + c.d3 <- Float.add a.d1 b.d1; + c + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_t4 + (a : t1) (b : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = b.d0; d3 = b.d1 } + +let[@inline never] [@local never] [@specialize never] dup_mutable_record_t4 + (a : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = a.d0; d3 = a.d1 } + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f }" t1.d0 t1.d1 + +let print_t4 ppf (t4 : t4) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f; d2 = %f ; d3 = %f }" t4.d0 t4.d1 + t4.d2 t4.d3 + +let 
() = + let a = { d0 = 8.; d1 = 96. } in + let b = { d0 = 80.; d1 = 14. } in + let c = { d0 = 10.; d1 = -10. } in + let t4 = { d0 = 10.; d1 = -10.; d2 = 199.; d3 = 18. } in + let res = { d0 = 0.; d1 = -0. } in + Format.printf "add_mutable_record %a\n" print_t1 (add_mutable_record a b c); + Format.printf "copy_mutable_record %a\n" print_t1 (copy_mutable_record c res); + Format.printf "add_mutable_record_fresh %a\n" print_t1 + (add_mutable_record_fresh a b); + Format.printf "copy_mutable_record_fresh %a\n" print_t1 + (copy_mutable_record_fresh c); + Format.printf "add_mutable_record_t4 %a\n" print_t4 + (add_mutable_record_t4 a b t4); + Format.printf "copy_mutable_record_t4 %a\n" print_t4 + (copy_mutable_record_t4 a b); + Format.printf "dup_mutable_record_t4 %a\n" print_t4 (dup_mutable_record_t4 a); + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float.mli b/flambda-backend/tests/backend/vectorizer/test_float.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected new file mode 100644 index 00000000000..92c4b798f9d --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected @@ -0,0 +1,5 @@ +add_unboxed_pairs_mutable_record { d0 = 88. ; d1 = 110.; d2 = 0. ; d3 = -1. } +copy_unboxed_pairs_mutable_record { d0 = 88. ; d1 = 110.; d2 = 0. ; d3 = -1. } +copy_bytes 10. 10. 10. 10. +copy_bytes_pos 10. 10. 10. 10. +copy_bytes_pos_v2 10. 10. 10. 10. 
diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml new file mode 100644 index 00000000000..ea552f169e2 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml @@ -0,0 +1,225 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Float32 = struct + type t = float32 + + external add : (t[@local_opt]) -> (t[@local_opt]) -> (t[@local_opt]) + = "%addfloat32" + + external format : string -> t -> string = "caml_format_float32" + + let to_string f = Stdlib.valid_float_lexem (format "%.9g" f) + + module Bytes = struct + external get : bytes -> pos:int -> float32 = "%caml_bytes_getf32" + external unsafe_get : bytes -> pos:int -> float32 = "%caml_bytes_getf32u" + external set : bytes -> pos:int -> float32 -> unit = "%caml_bytes_setf32" + + external unsafe_set : bytes -> pos:int -> float32 -> unit + = "%caml_bytes_setf32u" + end +end + +module Float32_u = struct + type t = float32# + + external to_float32 : t -> (float32[@local_opt]) = "%box_float32" [@@warning "-187"] + + external of_float32 : (float32[@local_opt]) -> t = "%unbox_float32" [@@warning "-187"] + + let[@inline always] add x y = of_float32 (Float32.add (to_float32 x) (to_float32 y)) + + module Bytes = struct + let get bytes ~pos = of_float32 (Float32.Bytes.get bytes ~pos) + let unsafe_get bytes ~pos = of_float32 (Float32.Bytes.unsafe_get bytes ~pos) + let set bytes ~pos x = Float32.Bytes.set bytes ~pos (to_float32 x) + let unsafe_set bytes ~pos x = Float32.Bytes.unsafe_set bytes ~pos (to_float32 x) + end +end + +type t1 = { mutable d0 : float32# ; + mutable d1: float32#; mutable d2: float32#; mutable d3: float32# } + +(* Not vectorized because float32 fields are not adjacent in a record, they are padded +to 64-bits. 
*) +let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit = + b.d0 <- a.d0; + b.d1 <- a.d1; + b.d2 <- a.d2; + b.d3 <- a.d3; + () + +(* Not vectorized because float32 fields are not adjacent in a record, they are padded +to 64-bits. *) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Float32_u.add a.d0 b.d0; + c.d1 <- Float32_u.add a.d1 b.d1; + c.d2 <- Float32_u.add a.d2 b.d2; + c.d3 <- Float32_u.add a.d3 b.d3; + c + +(* [Float32_u.Bytes] contain packed float32_u, can vectorize. *) +let[@inline never] [@local never] [@specialize never] copy_bytes a b = + let pos = 0 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +let[@inline never] [@local never] [@specialize never] init_bytes b x = + let pos = 0 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +let[@inline always] copy_float32_unboxed_pos a b ~pos = + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +(* Currently can't vectorize because [pos] untagging is repeated and the current + heuristic for detecting relations between pointers is not strong enough to + handle this case. 
*) +let[@inline never] [@local never] [@specialize never] copy_bytes_pos a b pos = + copy_float32_unboxed_pos a b ~pos; + copy_float32_unboxed_pos a b ~pos:(pos+1*4); + copy_float32_unboxed_pos a b ~pos:(pos+2*4); + copy_float32_unboxed_pos a b ~pos:(pos+3*4); + () + +(* 128: + * (id:3) a:V/61 := R:I/0[%rax] + * (id:4) b:V/62 := R:I/1[%rbx] + * (id:5) pos:I/63 := R:I/2[%rdi] + * (id:6) prim:I/64 := pos:I/63 + * (id:7) prim:I/64 := prim:I/64 >>s 1 + * (id:8) S/65 := float32 mut[a:V/61 + prim:I/64] + * (id:9) float32[b:V/62 + prim:I/64] := S/65 (assign) + * (id:10) Pbytes_set_f32:I/66 := 1 + * (id:11) I/67 := pos:I/63 + * (id:12) I/67 := I/67 + 8 + * (id:13) prim:I/68 := I/67 + * (id:14) prim:I/68 := prim:I/68 >>s 1 + * (id:15) S/69 := float32 mut[a:V/61 + prim:I/68] + * (id:16) float32[b:V/62 + prim:I/68] := S/69 (assign) + * (id:17) Pbytes_set_f32:I/70 := 1 + * (id:18) I/71 := pos:I/63 + * (id:19) I/71 := I/71 + 16 + * (id:20) prim:I/72 := I/71 + * (id:21) prim:I/72 := prim:I/72 >>s 1 + * (id:22) S/73 := float32 mut[a:V/61 + prim:I/72] + * (id:23) float32[b:V/62 + prim:I/72] := S/73 (assign) + * (id:24) Pbytes_set_f32:I/74 := 1 + * (id:25) I/75 := pos:I/63 + * (id:26) I/75 := I/75 + 24 + * (id:27) prim:I/76 := I/75 + * (id:28) prim:I/76 := prim:I/76 >>s 1 + * (id:29) S/77 := float32 mut[a:V/61 + prim:I/76] + * (id:30) float32[b:V/62 + prim:I/76] := S/77 (assign) + * (id:31) Pbytes_set_f32:I/78 := 1 + * (id:32) I/79 := 1 + * (id:33) R:I/0[%rax] := I/79 + * (id:34) Return R:I/0[%rax] *) + +(* Currently, can't vectorize because the index is untagged before every memory access, + instead of operating on untagged indexes throughout. 
*) +let[@inline never] [@local never] [@specialize never] copy_bytes_pos_v2 a b pos = + let i0 = pos in + copy_float32_unboxed_pos a b ~pos:i0; + let i1 = i0 + 4 in + copy_float32_unboxed_pos a b ~pos:i1; + let i2 = i1 + 4 in + copy_float32_unboxed_pos a b ~pos:i2; + let i3 = i2 + 4 in + copy_float32_unboxed_pos a b ~pos:i3; + () + +(* 177: + * (id:3) a:V/61 := R:I/0[%rax] + * (id:4) b:V/62 := R:I/1[%rbx] + * (id:5) pos:I/63 := R:I/2[%rdi] + * (id:6) prim:I/64 := pos:I/63 + * (id:7) prim:I/64 := prim:I/64 >>s 1 + * (id:8) S/65 := float32 mut[a:V/61 + prim:I/64] + * (id:9) float32[b:V/62 + prim:I/64] := S/65 (assign) + * (id:10) Pbytes_set_f32:I/66 := 1 + * (id:11) i1:I/67 := pos:I/63 + * (id:12) i1:I/67 := i1:I/67 + 8 + * (id:13) prim:I/68 := i1:I/67 + * (id:14) prim:I/68 := prim:I/68 >>s 1 + * (id:15) S/69 := float32 mut[a:V/61 + prim:I/68] + * (id:16) float32[b:V/62 + prim:I/68] := S/69 (assign) + * (id:17) Pbytes_set_f32:I/70 := 1 + * (id:18) i2:I/71 := i1:I/67 + * (id:19) i2:I/71 := i2:I/71 + 8 + * (id:20) prim:I/72 := i2:I/71 + * (id:21) prim:I/72 := prim:I/72 >>s 1 + * (id:22) S/73 := float32 mut[a:V/61 + prim:I/72] + * (id:23) float32[b:V/62 + prim:I/72] := S/73 (assign) + * (id:24) Pbytes_set_f32:I/74 := 1 + * (id:25) I/75 := i2:I/71 + * (id:26) I/75 := I/75 + 8 + * (id:27) prim:I/76 := I/75 + * (id:28) prim:I/76 := prim:I/76 >>s 1 + * (id:29) S/77 := float32 mut[a:V/61 + prim:I/76] + * (id:30) float32[b:V/62 + prim:I/76] := S/77 (assign) + * (id:31) Pbytes_set_f32:I/78 := 1 + * (id:32) I/79 := 1 + * (id:33) R:I/0[%rax] := I/79 + * (id:34) Return R:I/0[%rax] *) + + +let print_t1 ppf (t1 : t1) = + (* CR gyorsh: how to print Float32? 
*) + let to_string f = (Float32_u.to_float32 f |> Float32.to_string) in + Format.fprintf ppf "{ d0 = %s ; d1 = %s; d2 = %s ; d3 = %s }" + (to_string t1.d0) + (to_string t1.d1) + (to_string t1.d2) + (to_string t1.d3) + +let create_s length = + String.init length (fun i -> i * 7 mod 256 |> char_of_int) +;; + +let create_b length = create_s length |> Bytes.of_string + +let print_b ~len ppf b = + for i = 0 to len-1 do + Format.fprintf ppf "%s " + (Float32_u.to_float32 (Float32_u.Bytes.get b ~pos:(i*4)) |> Float32.to_string) + done + +let () = + let a = { d0 = #8.s; d1 = #96.s; d2 = #0.s; d3 = -#0.5s } in + let b = { d0 = #80.s; d1 = #14.s; d2 = #0.s; d3 = -#0.5s } in + let c = { d0 = #8.s; d1 = #96.s; d2 = #0.s; d3 = -#0.s } in + let res = { d0 = #0.s; d1 = -#10.s; d2 = #1.s; d3 = -#1.s } in + Format.printf "add_unboxed_pairs_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + copy_mutable_record c res; + Format.printf "copy_unboxed_pairs_mutable_record %a\n" print_t1 res; + let b1 = create_b 16 in + let b2 = create_b 16 in + init_bytes b1 #10.s; + init_bytes b2 #0.s; + copy_bytes b1 b2; + Format.printf "copy_bytes %a\n" (print_b ~len:4) b2; + copy_bytes_pos b2 b1 (Sys.opaque_identity 0); + Format.printf "copy_bytes_pos %a\n" (print_b ~len:4) b2; + copy_bytes_pos_v2 b1 b2 (Sys.opaque_identity 0); + Format.printf "copy_bytes_pos_v2 %a\n" (print_b ~len:4) b2; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 
00000000000..3178ac03fb8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1 @@ +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_float32_unboxed_vectorized.copy_bytes) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected new file mode 100644 index 00000000000..bfea42ed769 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected @@ -0,0 +1,2 @@ +add_mutable_record { d0 = 88.000000 ; d1 = 110.000000; d2 = 0.000000 ; d3 = -1.000000 } +copy_mutable_record { d0 = 88.000000 ; d1 = 110.000000; d2 = 1.000000 ; d3 = -1.000000 } diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml new file mode 100644 index 00000000000..a49aaf0b841 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml @@ -0,0 +1,80 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Float_u = struct + type t = float# + + external to_float : t -> (float[@local_opt]) = "%box_float" [@@warning "-187"] + + external of_float : (float[@local_opt]) -> t = "%unbox_float" [@@warning "-187"] + + let[@inline always] add x y = of_float (Float.add (to_float x) (to_float y)) +end + +type t1 = { mutable d0: float#; + mutable d1: float#; + mutable d2: float#; + mutable d3: float# + } + + +let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit = + b.d0 <- a.d0; + b.d1 <- a.d1; + () + +(* Currently, can't vectorize because of the specific floatmem operation (looks like + it is treated overly conservatively). NOTE(review): the expected vectorizer dump for this test lists add_mutable_record as vectorized; confirm this comment is still accurate. 
*) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Float_u.add a.d0 b.d0; + c.d1 <- Float_u.add a.d1 b.d1; + c.d2 <- Float_u.add a.d2 b.d2; + c.d3 <- Float_u.add a.d3 b.d3; + c + +(* +102: +(id:3) a:V/61 := R:I/0[%rax] +(id:4) b:V/62 := R:I/1[%rbx] +(id:5) c:V/63 := R:I/2[%rdi] +(id:6) F/64 := float64 mut[a:V/61] +(id:7) F/65 := F/64 +(id:8) F/65 := F/65 +f float64[b:V/62] +(id:9) float64[c:V/63] := F/65 (assign) +(id:10) Psetufloatfield:I/66 := 1 +(id:11) F/67 := float64 mut[a:V/61 + 8] +(id:12) F/68 := F/67 +(id:13) F/68 := F/68 +f float64[b:V/62 + 8] +(id:14) float64[c:V/63 + 8] := F/68 (assign) +(id:15) Psetufloatfield:I/69 := 1 +(id:16) F/70 := float64 mut[a:V/61 + 16] +(id:17) F/71 := F/70 +(id:18) F/71 := F/71 +f float64[b:V/62 + 16] +(id:19) float64[c:V/63 + 16] := F/71 (assign) +(id:20) Psetufloatfield:I/72 := 1 +(id:21) F/73 := float64 mut[a:V/61 + 24] +(id:22) F/74 := F/73 +(id:23) F/74 := F/74 +f float64[b:V/62 + 24] +(id:24) float64[c:V/63 + 24] := F/74 (assign) +(id:25) Psetufloatfield:I/75 := 1 +(id:26) R:I/0[%rax] := c:V/63 +(id:27) Return R:I/0[%rax] + +*) + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f; d2 = %f ; d3 = %f }" + (Float_u.to_float t1.d0) + (Float_u.to_float t1.d1) + (Float_u.to_float t1.d2) + (Float_u.to_float t1.d3) + +let () = + let a = { d0 = #8.; d1 = #96.; d2 = #0.; d3 = -#0.5 } in + let b = { d0 = #80.; d1 = #14.; d2 = #0.; d3 = -#0.5 } in + let c = { d0 = #8.; d1 = #96.; d2 = #0.; d3 = -#0. } in + let res = { d0 = #0.; d1 = -#10.; d2 = #1.; d3 = -#1. 
} in + Format.printf "add_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + copy_mutable_record c res; + Format.printf "copy_mutable_record %a\n" print_t1 res; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..357dba19d99 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1,2 @@ +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_unboxed_vectorized.copy_mutable_record) +**** Vectorize selected computation: 8 groups, 16 scalar instructions, 10 vector instructions, cost = -6 (Test_float_unboxed_vectorized.add_mutable_record) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..dc486848738 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected @@ -0,0 +1,7 @@ +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 5 vector instructions, cost = -3 (Test_float_vectorized.add_mutable_record) +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_vectorized.copy_mutable_record) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 5 vector instructions, cost = -3 
(Test_float_vectorized.add_mutable_record_fresh) +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_vectorized.copy_mutable_record_fresh) +**** Vectorize selected computation: 8 groups, 16 scalar instructions, 10 vector instructions, cost = -6 (Test_float_vectorized.add_mutable_record_t4) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_float_vectorized.copy_mutable_record_t4) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_float_vectorized.dup_mutable_record_t4) diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected new file mode 100644 index 00000000000..0207ed6b915 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected @@ -0,0 +1,7 @@ +add_mutable_record { d0 = 88 ; d1 = 110; d2 = -40 ; d3 = -100 } +copy_array_four 30 30 30 30 +copy_array_index_four 30 30 30 30 +add_array_from_start 60 60 60 60 +copy_array_index_from_start 60 60 60 60 +copy_array_from_start 60 60 60 60 +copy_array_from_start_v2 60 60 60 60 diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml new file mode 100644 index 00000000000..b45eaa57769 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml @@ -0,0 +1,229 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Int32_u = struct + type t = int32# + + external to_int32 : t -> (int32[@local_opt]) = "%box_int32" [@@warning "-187"] + + external of_int32 : (int32[@local_opt]) -> t = "%unbox_int32" [@@warning "-187"] + + let[@inline always] add x y = of_int32 (Int32.add (to_int32 x) (to_int32 y)) + + module Array = struct + external unsafe_create : ('a : bits32). 
int -> 'a array = + "caml_make_unboxed_int32_vect_bytecode" "caml_make_unboxed_int32_vect" + external unsafe_get: ('a : bits32). 'a array -> int -> 'a = "%array_unsafe_get" + external unsafe_set: ('a : bits32). 'a array -> int -> 'a -> unit = "%array_unsafe_set" + + module Index = struct + external unsafe_get + : ('a : bits32). + ('a array) -> t -> 'a + = "%array_unsafe_get_indexed_by_int32#" + + external unsafe_set + : ('a : bits32). + 'a array -> t -> 'a -> unit + = "%array_unsafe_set_indexed_by_int32#" + end + end + +end + +type t1 = { mutable d0 : int32# ; mutable d1: int32#; mutable d2: int32#; mutable d3: int32# } + +(* Currently, can't vectorize because not adjacent and have an unnecessary sign extension. *) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Int32_u.add a.d0 b.d0; + c.d1 <- Int32_u.add a.d1 b.d1; + c.d2 <- Int32_u.add a.d2 b.d2; + c.d3 <- Int32_u.add a.d3 b.d3; + c + +let[@inline always] copy_array_one (a : Int32_u.t array) + (b : Int32_u.t array) pos = + let x = Int32_u.Array.unsafe_get a pos in + Int32_u.Array.unsafe_set b pos x + +(* The accesses are adjacent but the use of [int] typed index results in a convoluted + index computation that is not yet handled by the current heuristics. 
*) +let[@inline never] [@local never][@specialize never] copy_array_four (a : Int32_u.t array) + (b : Int32_u.t array) ~pos = + copy_array_one a b pos; + copy_array_one a b (pos+1); + copy_array_one a b (pos+2); + copy_array_one a b (pos+3); + () + +(* + +114: +(id:3) a:V/61 := R:I/0[%rax] +(id:4) b:V/62 := R:I/1[%rbx] +(id:5) pos:I/63 := R:I/2[%rdi] +(id:6) new_value:I/64 := signed int32 mut[a:V/61 + pos:I/63 * 2 + 6] +(id:7) signed int32[b:V/62 + pos:I/63 * 2 + 6] := new_value:I/64 (assign) +(id:8) Parraysetu:I/65 := 1 +(id:9) Paddint:I/66 := pos:I/63 +(id:10) Paddint:I/66 := Paddint:I/66 + 2 +(id:11) new_value:I/67 := signed int32 mut[a:V/61 + Paddint:I/66 * 2 + 6] +(id:12) signed int32[b:V/62 + Paddint:I/66 * 2 + 6] := new_value:I/67 (assign) +(id:13) Parraysetu:I/68 := 1 +(id:14) Paddint:I/69 := pos:I/63 +(id:15) Paddint:I/69 := Paddint:I/69 + 4 +(id:16) new_value:I/70 := signed int32 mut[a:V/61 + Paddint:I/69 * 2 + 6] +(id:17) signed int32[b:V/62 + Paddint:I/69 * 2 + 6] := new_value:I/70 (assign) +(id:18) Parraysetu:I/71 := 1 +(id:19) Paddint:I/72 := pos:I/63 +(id:20) Paddint:I/72 := Paddint:I/72 + 6 +(id:21) new_value:I/73 := signed int32 mut[a:V/61 + Paddint:I/72 * 2 + 6] +(id:22) signed int32[b:V/62 + Paddint:I/72 * 2 + 6] := new_value:I/73 (assign) +(id:23) Parraysetu:I/74 := 1 +(id:24) I/75 := 1 +(id:25) R:I/0[%rax] := I/75 +(id:26) Return R:I/0[%rax] + +*) + +let[@inline never] [@local never][@specialize never] copy_array_four_v2 (a : Int32_u.t array) + (b : Int32_u.t array) ~pos = + let i0 = pos in + copy_array_one a b i0; + let i1 = i0 + 1 in + copy_array_one a b i1; + let i2 = i1 + 1 in + copy_array_one a b i2; + let i3 = i2 + 1 in + copy_array_one a b i3; + () + +let[@inline always] copy_array_index_one (a : Int32_u.t array) + (b : Int32_u.t array) (pos : Int32_u.t) = + let x = Int32_u.Array.Index.unsafe_get a pos in + Int32_u.Array.Index.unsafe_set b pos x + +(* Can't vectorize it! 
The accesses are adjacent and we use [Int32_u.t] as index, + but the compiler tags the index before using it! This index computation is not + yet handled by the vectorizer's heuristics. *) +let[@inline never] [@local never][@specialize never] copy_array_index_four (a : Int32_u.t array) + (b : Int32_u.t array) ~pos = + copy_array_index_one a b pos; + copy_array_index_one a b (Int32_u.add pos #1l); + copy_array_index_one a b (Int32_u.add pos #2l); + copy_array_index_one a b (Int32_u.add pos #3l); + () + +let[@inline never] [@local never][@specialize never] copy_array_index_from_start (a : Int32_u.t array) + (b : Int32_u.t array) = + let pos = #0l in + copy_array_index_one a b pos; + copy_array_index_one a b (Int32_u.add pos #1l); + copy_array_index_one a b (Int32_u.add pos #2l); + copy_array_index_one a b (Int32_u.add pos #3l); + () + + let[@inline never] [@local never][@specialize never] copy_array_from_start (a : Int32_u.t array) + (b : Int32_u.t array) = + let[@inline always] copy pos = + let x = Int32_u.Array.unsafe_get a pos in + Int32_u.Array.unsafe_set b pos x + in + let pos = 0 in + copy pos; + copy (pos+1); + copy (pos+2); + copy (pos+3); + () + +(* Can't vectorize because of an unnecessary sign extension. The heuristics in the + vectorizer can be extended to handle this case. 
*) +let[@inline never] [@local never][@specialize never] add_array_from_start (a : Int32_u.t array) (b : Int32_u.t array) = + let[@inline always] add pos = + let x = Int32_u.Array.unsafe_get a pos in + let y = Int32_u.Array.unsafe_get b pos in + Int32_u.Array.unsafe_set b pos (Int32_u.add x y) + in + let pos = 0 in + add pos; + add (pos+1); + add (pos+2); + add (pos+3); + () + +(* +camlTest7__add_array_from_start_7_22_code(R:I/0[%rax] R:I/1[%rbx]) {test7.ml:112,74-379} + a:V/61 := R:I/0[%rax] + b:V/62 := R:I/1[%rbx] + I/63 := signed int32 mut[b:V/62 + 8]{test7.ml:119,2-9;test7.ml:115,12-42} + I/64 := signed int32 mut[a:V/61 + 8]{test7.ml:119,2-9;test7.ml:114,12-42} + I/65 := I/64 + I/65 := I/65 + I/63{test7.ml:119,2-9;test7.ml:116,35-52;test7.ml:10,41-78} + new_value:I/66 := sextend32 I/65{test7.ml:119,2-9;test7.ml:116,35-52;test7.ml:10,41-78} + signed int32[b:V/62 + 8] := new_value:I/66 (assign){test7.ml:119,2-9;test7.ml:116,4-52} + Parraysetu:I/67 := 1 + I/68 := signed int32 mut[b:V/62 + 12]{test7.ml:120,2-13;test7.ml:115,12-42} + I/69 := signed int32 mut[a:V/61 + 12]{test7.ml:120,2-13;test7.ml:114,12-42} + I/70 := I/69 + I/70 := I/70 + I/68{test7.ml:120,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + new_value:I/71 := sextend32 I/70{test7.ml:120,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + signed int32[b:V/62 + 12] := new_value:I/71 (assign){test7.ml:120,2-13;test7.ml:116,4-52} + Parraysetu:I/72 := 1 + I/73 := signed int32 mut[b:V/62 + 16]{test7.ml:121,2-13;test7.ml:115,12-42} + I/74 := signed int32 mut[a:V/61 + 16]{test7.ml:121,2-13;test7.ml:114,12-42} + I/75 := I/74 + I/75 := I/75 + I/73{test7.ml:121,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + new_value:I/76 := sextend32 I/75{test7.ml:121,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + signed int32[b:V/62 + 16] := new_value:I/76 (assign){test7.ml:121,2-13;test7.ml:116,4-52} + Parraysetu:I/77 := 1 + I/78 := signed int32 mut[b:V/62 + 20]{test7.ml:122,2-13;test7.ml:115,12-42} + I/79 := signed int32 mut[a:V/61 + 
20]{test7.ml:122,2-13;test7.ml:114,12-42} + I/80 := I/79 + I/80 := I/80 + I/78{test7.ml:122,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + new_value:I/81 := sextend32 I/80{test7.ml:122,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + signed int32[b:V/62 + 20] := new_value:I/81 (assign){test7.ml:122,2-13;test7.ml:116,4-52} + Parraysetu:I/82 := 1 + I/83 := 1 + R:I/0[%rax] := I/83 + return R:I/0[%rax] +*) +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %ld ; d1 = %ld; d2 = %ld ; d3 = %ld }" + (Int32_u.to_int32 t1.d0) + (Int32_u.to_int32 t1.d1) + (Int32_u.to_int32 t1.d2) + (Int32_u.to_int32 t1.d3) + +let print_array ~len ppf ( a : Int32_u.t array)= + for i = 0 to len - 1 do + let x = Int32_u.Array.unsafe_get a i in + Format.fprintf ppf "%ld " (x |> Int32_u.to_int32) + done + +let create_array ~len ~init = + let arr = Int32_u.Array.unsafe_create len in + for i = 0 to len-1 do + Int32_u.Array.unsafe_set arr i init + done; + arr + +let () = + let a = { d0 = #8l; d1 = #96l; d2 = -#10l; d3 = #0l } in + let b = { d0 = #80l; d1 = #14l; d2 = -#30l; d3 = -#100l } in + let c = { d0 = #8l; d1 = #96l; d2 = #0l; d3 = #0l } in + Format.printf "add_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + let ar1 = create_array ~len:4 ~init:#30l in + let ar2 = create_array ~len:4 ~init:#0l in + copy_array_four ar1 ar2 ~pos:0; + Format.printf "copy_array_four %a\n" (print_array ~len:4) ar2; + copy_array_index_four ar2 ar1 ~pos:#0l; + Format.printf "copy_array_index_four %a\n" (print_array ~len:4) ar1; + add_array_from_start ar1 ar2; + Format.printf "add_array_from_start %a\n" (print_array ~len:4) ar2; + copy_array_index_from_start ar2 ar1; + Format.printf "copy_array_index_from_start %a\n" (print_array ~len:4) ar1; + copy_array_from_start ar1 ar2; + Format.printf "copy_array_from_start %a\n" (print_array ~len:4) ar2; + copy_array_four_v2 ar1 ar2 ~pos:0; + Format.printf "copy_array_from_start_v2 %a\n" (print_array ~len:4) ar2; + () diff --git 
a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..fef3d590f81 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1,3 @@ +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_four_v2) +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_index_from_start) +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_from_start) diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.expected b/flambda-backend/tests/backend/vectorizer/test_int64.expected new file mode 100644 index 00000000000..21d3934339d --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64.expected @@ -0,0 +1,7 @@ +add_mutable_record { d0 = 88 ; d1 = 110 } +copy_mutable_record { d0 = 88 ; d1 = 110 } +add_mutable_record_fresh { d0 = 88 ; d1 = 110 } +copy_mutable_record_fresh { d0 = 88 ; d1 = 110 } +add_mutable_record_t4 { d0 = 88 ; d1 = 110; d2 = 88 ; d3 = 110 } +copy_mutable_record_t4 { d0 = 8 ; d1 = 96; d2 = 80 ; d3 = 14 } +dup_mutable_record_t4 { d0 = 8 ; d1 = 96; d2 = 8 ; d3 = 96 } diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.ml b/flambda-backend/tests/backend/vectorizer/test_int64.ml new file mode 
100644 index 00000000000..95603dd7773 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64.ml @@ -0,0 +1,79 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +type t1 = + { mutable d0 : int64; + mutable d1 : int64 + } + +(* Can't vectorize because int64 are boxed. *) +let[@inline never] [@local never] [@specialize never] add_mutable_record + (a : t1) (b : t1) (c : t1) : t1 = + c.d0 <- Int64.add a.d0 b.d0; + c.d1 <- Int64.add a.d1 b.d1; + c + +(* Can't vectorize because memory write requires [caml_modify]. *) +let[@inline never] [@local never] [@specialize never] copy_mutable_record + (a : t1) (b : t1) : t1 = + b.d0 <- a.d0; + b.d1 <- a.d1; + b + +(* Can't vectorize because int64 are boxed *) +let[@inline never] [@local never] [@specialize never] add_mutable_record_fresh + (a : t1) (b : t1) : t1 = + { d0 = Int64.add a.d0 b.d0; d1 = Int64.add a.d1 b.d1 } + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_fresh + (a : t1) : t1 = + { d0 = a.d0; d1 = a.d1 } + +type t4 = + { mutable d0 : int64; + mutable d1 : int64; + mutable d2 : int64; + mutable d3 : int64 + } + +(* Can't vectorize because int64 are boxed. 
*) +let[@inline never] [@local never] [@specialize never] add_mutable_record_t4 + (a : t1) (b : t1) (c : t4) : t4 = + c.d0 <- Int64.add a.d0 b.d0; + c.d1 <- Int64.add a.d1 b.d1; + c.d2 <- Int64.add a.d0 b.d0; + c.d3 <- Int64.add a.d1 b.d1; + c + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_t4 + (a : t1) (b : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = b.d0; d3 = b.d1 } + +let[@inline never] [@local never] [@specialize never] dup_mutable_record_t4 + (a : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = a.d0; d3 = a.d1 } + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld }" t1.d0 t1.d1 + +let print_t4 ppf (t4 : t4) = + Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld; d2 = %Ld ; d3 = %Ld }" t4.d0 t4.d1 + t4.d2 t4.d3 + +let () = + let a = { d0 = 8L; d1 = 96L } in + let b = { d0 = 80L; d1 = 14L } in + let c = { d0 = 10L; d1 = -10L } in + let t4 = { d0 = 10L; d1 = -10L; d2 = 199L; d3 = 18L } in + let res = { d0 = 0L; d1 = -0L } in + Format.printf "add_mutable_record %a\n" print_t1 (add_mutable_record a b c); + Format.printf "copy_mutable_record %a\n" print_t1 (copy_mutable_record c res); + Format.printf "add_mutable_record_fresh %a\n" print_t1 + (add_mutable_record_fresh a b); + Format.printf "copy_mutable_record_fresh %a\n" print_t1 + (copy_mutable_record_fresh c); + Format.printf "add_mutable_record_t4 %a\n" print_t4 + (add_mutable_record_t4 a b t4); + Format.printf "copy_mutable_record_t4 %a\n" print_t4 + (copy_mutable_record_t4 a b); + Format.printf "dup_mutable_record_t4 %a\n" print_t4 (dup_mutable_record_t4 a); + () diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.mli b/flambda-backend/tests/backend/vectorizer/test_int64.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git 
a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected new file mode 100644 index 00000000000..68b6515c901 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected @@ -0,0 +1,3 @@ +add_mutable_record { d0 = 88 ; d1 = 110 } +copy_mutable_record { d0 = 88 ; d1 = 110 } +add_fours_mutable_record { d0 = 88 ; d1 = 110; d2 = 88 ; d3 = 110 } diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml new file mode 100644 index 00000000000..d9371e65e8f --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml @@ -0,0 +1,61 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Int64_u = struct + type t = int64# + + external to_int64 : t -> (int64[@local_opt]) = "%box_int64" [@@warning "-187"] + + external of_int64 : (int64[@local_opt]) -> t = "%unbox_int64" [@@warning "-187"] + + let[@inline always] add x y = of_int64 (Int64.add (to_int64 x) (to_int64 y)) +end + +type t1 = { mutable d0 : int64# ; mutable d1: int64# } + +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Int64_u.add a.d0 b.d0; + c.d1 <- Int64_u.add a.d1 b.d1; + c + +let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit = + b.d0 <- a.d0; + b.d1 <- a.d1; + () + +type t2 = { + mutable d0 : int64# ; + mutable d1: int64# ; + mutable d2: int64# ; + mutable d3: int64# } + +let[@inline never] [@local never][@specialize never] add_fours_mutable_record (a : t1) (b: t1) (c : t2) : unit = + c.d0 <- Int64_u.add a.d0 b.d0; + c.d1 <- Int64_u.add a.d1 b.d1; + c.d2 <- Int64_u.add a.d0 b.d0; + c.d3 <- Int64_u.add a.d1 b.d1; + () + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld }" (Int64_u.to_int64 t1.d0) + (Int64_u.to_int64 t1.d1) + +let print_t4 ppf (t2 : t2) = + Format.fprintf 
ppf "{ d0 = %Ld ; d1 = %Ld; d2 = %Ld ; d3 = %Ld }" + (Int64_u.to_int64 t2.d0) + (Int64_u.to_int64 t2.d1) + (Int64_u.to_int64 t2.d2) + (Int64_u.to_int64 t2.d3) + +let () = + let a = { d0 = #8L; d1 = #96L } in + let b = { d0 = #80L; d1 = #14L } in + let c = { d0 = #8L; d1 = #96L } in + let d = { d0 = #0L; d1 = #0L; d2 = #0L; d3 = #0L } in + let res = { d0 = #0L; d1 = -#10L } in + Format.printf "add_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + copy_mutable_record c res; + Format.printf "copy_mutable_record %a\n" print_t1 res; + add_fours_mutable_record a b d; + Format.printf "add_fours_mutable_record %a\n" print_t4 d; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..61eea8dffce --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1,3 @@ +**** Vectorize selected computation: 5 groups, 10 scalar instructions, 5 vector instructions, cost = -5 (Test_int64_unboxed_vectorized.add_mutable_record) +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_int64_unboxed_vectorized.copy_mutable_record) +**** Vectorize selected computation: 10 groups, 20 scalar instructions, 10 vector instructions, cost = -10 (Test_int64_unboxed_vectorized.add_fours_mutable_record) diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected 
b/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..6db1b67d70d --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected @@ -0,0 +1,3 @@ +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_int64_vectorized.copy_mutable_record_fresh) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_int64_vectorized.copy_mutable_record_t4) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_int64_vectorized.dup_mutable_record_t4)