Merge branch 'global-name-is-module-id' into instance-syntax

lukemaurer · Oct 19, 2023 · 8b42e26 · 8b42e26
2 parents c7f2a29 + 156fd09
commit 8b42e26
Show file tree

Hide file tree

Showing 438 changed files with 43,746 additions and 16,731 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -8,9 +8,6 @@
 
 autoconf-aux            @mshinwell @xclerc
 configure.ac            @mshinwell @xclerc
-**/dune                 @mshinwell @xclerc
-**/dune-project         @mshinwell @xclerc
-Makefile.in             @mshinwell @xclerc
 
 flambda_backend.opam    @mshinwell @lthls
 

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -107,6 +107,14 @@ jobs:
       with:
         path: 'flambda_backend'
 
+    - name: Install AFL (for Linux workers)
+      if: matrix.os == 'ubuntu-latest'
+      run: sudo apt-get install afl++
+
+    - name: Install AFL (for macOS workers)
+      if: matrix.os == 'macos-latest'
+      run: HOMEBREW_NO_INSTALL_CLEANUP=TRUE brew install afl-fuzz
+
     - name: Cache OCaml 4.14 and dune
       uses: actions/cache@v2
       id: cache

diff --git a/.github/workflows/jane_syntax.yml b/.github/workflows/jane_syntax.yml
@@ -0,0 +1,28 @@
+name: jane-syntax-upstream-build
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        ocaml-compiler:
+          - "4.14.0"
+
+    steps:
+    - name: Checkout the Flambda backend repo
+      uses: actions/checkout@master
+      with:
+        path: 'flambda_backend'
+
+    - name: Setup OCaml ${{ matrix.ocaml-compiler }}
+      uses: ocaml/setup-ocaml@v2
+      with:
+        ocaml-compiler: ${{ matrix.ocaml-compiler }}
+
+    - name: Try building Jane_syntax and its dependencies with upstream OCaml
+      working-directory: flambda_backend
+      run: opam exec -- ocaml/tools/build_jane_syntax_with_active_opam_switch.sh
diff --git a/Makefile b/Makefile
@@ -183,4 +183,4 @@ ocamlopt:
 
 .ocamldebug: install
 	find _build/main -name '*.cmo' -type f -printf 'directory %h\n' | sort -u > .ocamldebug
-	echo "source ocaml/tools/debug_printers" >> .ocamldebug
+	echo "source _build/main/$(ocamldir)/tools/debug_printers" >> .ocamldebug
diff --git a/backend/.ocamlformat-enable b/backend/.ocamlformat-enable
@@ -16,3 +16,7 @@ peephole/**/*.ml
 peephole/**/*.mli
 regalloc/**/*.ml
 regalloc/**/*.mli
+amd64/simd.ml 
+arm64/simd.ml 
+amd64/simd_selection.ml 
+arm64/simd_selection.ml 
diff --git a/backend/CSEgen.ml b/backend/CSEgen.ml
@@ -247,8 +247,8 @@ method class_of_operation op =
   | Iintop_atomic _ -> Op_store true
   | Icompf _
   | Icsel _
-  | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
-  | Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue -> Op_pure
+  | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf | Iscalarcast _
+  | Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue | Ivectorcast _ -> Op_pure
   | Ispecific _ -> Op_other
   | Iname_for_debugger _ -> Op_other
   | Iprobe_is_enabled _ -> Op_other

diff --git a/backend/amd64/CSE.ml b/backend/amd64/CSE.ml
@@ -36,15 +36,18 @@ method! class_of_operation op =
     | Ibswap _ | Isqrtf -> super#class_of_operation op
     | Irdtsc | Irdpmc
     | Ilfence | Isfence | Imfence -> Op_other
-    | Ifloat_iround | Ifloat_min | Ifloat_max | Ifloat_round _
-    | Icrc32q -> Op_pure
+    | Ifloat_iround | Ifloat_min | Ifloat_max | Ifloat_round _ -> Op_pure
+    | Isimd op ->
+      begin match Simd.class_of_operation op with
+      | Pure -> Op_pure
+      end
     | Ipause
     | Iprefetch _ -> Op_other
     end
   | Imove | Ispill | Ireload | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
   | Icompf _
   | Icsel _
-  | Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue
+  | Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue | Ivectorcast _ | Iscalarcast _
   | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
   | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _ | Iextcall _
   | Istackoffset _ | Iload _ | Istore _ | Ialloc _

diff --git a/backend/amd64/arch.ml b/backend/amd64/arch.ml
@@ -31,6 +31,15 @@ let prefetchwt1_support = ref false
 (* Emit elf notes with trap handling information. *)
 let trap_notes = ref true
 
+(* Baseline x86_64 requires SSE and SSE2. The others are optional. *)
+let sse3_support = ref true
+let ssse3_support = ref true
+let sse41_support = ref true
+let sse42_support = ref true
+
+(* Enable SIMD register allocation features. *)
+let simd_regalloc = ref false
+
 (* Machine-specific command-line options *)
 
 let command_line_options =
@@ -42,10 +51,6 @@ let command_line_options =
       " Use POPCNT instruction (not available prior to Nehalem) (default)";
     "-fno-popcnt", Arg.Clear popcnt_support,
       " Do not use POPCNT instruction";
-    "-fcrc32", Arg.Set crc32_support,
-      " Use CRC32 instructions (requires SSE4.2 support) (default)";
-    "-fno-crc32", Arg.Clear crc32_support,
-      " Do not emit CRC32 instructions";
     "-fprefetchw", Arg.Set prefetchw_support,
       " Use PREFETCHW instructions (not available on Haswell and earlier) \
         (default)";
@@ -58,7 +63,27 @@ let command_line_options =
     "-ftrap-notes", Arg.Set trap_notes,
       " Emit .note.ocaml_eh section with trap handling information (default)";
     "-fno-trap-notes", Arg.Clear trap_notes,
-      " Do not emit .note.ocaml_eh section with trap handling information"
+      " Do not emit .note.ocaml_eh section with trap handling information";
+    "-fsse3", Arg.Set sse3_support,
+      " Enable SSE3 intrinsics (default)";
+    "-fno-sse3", Arg.Clear sse3_support,
+      " Disable SSE3 intrinsics";
+    "-fssse3", Arg.Set ssse3_support,
+      " Enable SSSE3 intrinsics (default)";
+    "-fno-ssse3", Arg.Clear ssse3_support,
+      " Disable SSSE3 intrinsics";
+    "-fsse41", Arg.Set sse41_support,
+      " Enable SSE4.1 intrinsics (default)";
+    "-fno-sse41", Arg.Clear sse41_support,
+      " Disable SSE4.1 intrinsics";
+    "-fsse42", Arg.Set sse42_support,
+      " Enable SSE4.2 intrinsics (default)";
+    "-fno-sse42", Arg.Clear sse42_support,
+      " Disable SSE4.2 intrinsics";
+    "-fsimd-regalloc", Arg.Set simd_regalloc,
+      " Enable SIMD register allocation (implied by -extension SIMD)";
+    "-fno-simd-regalloc", Arg.Clear simd_regalloc,
+      " Disable SIMD register allocation (overridden by -extension SIMD) (default)";
   ]
 
 (* Specific operations for the AMD64 processor *)
@@ -111,8 +136,8 @@ type specific_operation =
   | Ilfence                            (* load fence *)
   | Isfence                            (* store fence *)
   | Imfence                            (* memory fence *)
-  | Icrc32q                            (* compute crc *)
   | Ipause                             (* hint for spin-wait loops *)
+  | Isimd of Simd.operation            (* vectorized operations *)
   | Iprefetch of                       (* memory prefetching hint *)
       { is_write: bool;
         locality: prefetch_temporal_locality_hint;
@@ -241,8 +266,8 @@ let print_specific_operation printreg op ppf arg =
       fprintf ppf "mfence"
   | Irdpmc ->
       fprintf ppf "rdpmc %a" printreg arg.(0)
-  | Icrc32q ->
-      fprintf ppf "crc32 %a %a" printreg arg.(0) printreg arg.(1)
+  | Isimd simd ->
+      Simd.print_operation printreg simd ppf arg
   | Ipause ->
       fprintf ppf "pause"
   | Iprefetch { is_write; locality; } ->
@@ -262,19 +287,19 @@ let operation_is_pure = function
   | Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32 -> true
   | Ifloatarithmem _ | Ifloatsqrtf _ -> true
   | Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max -> true
-  | Icrc32q -> true
   | Irdtsc | Irdpmc | Ipause
   | Ilfence | Isfence | Imfence
   | Istore_int (_, _, _) | Ioffset_loc (_, _)
   | Iprefetch _ -> false
+  | Isimd op -> Simd.is_pure op
 
 (* Specific operations that can raise *)
 
 let operation_can_raise = function
   | Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32
   | Ifloatarithmem _ | Ifloatsqrtf _
   | Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max
-  | Icrc32q | Irdtsc | Irdpmc | Ipause
+  | Irdtsc | Irdpmc | Ipause | Isimd _
   | Ilfence | Isfence | Imfence
   | Istore_int (_, _, _) | Ioffset_loc (_, _)
   | Iprefetch _ -> false
@@ -283,7 +308,7 @@ let operation_allocates = function
   | Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32
   | Ifloatarithmem _ | Ifloatsqrtf _
   | Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max
-  | Icrc32q | Irdtsc | Irdpmc | Ipause
+  | Irdtsc | Irdpmc | Ipause | Isimd _
   | Ilfence | Isfence | Imfence
   | Istore_int (_, _, _) | Ioffset_loc (_, _)
   | Iprefetch _ -> false
@@ -376,8 +401,6 @@ let equal_specific_operation left right =
     true
   | Imfence, Imfence ->
     true
-  | Icrc32q, Icrc32q ->
-    true
   | Ifloat_iround, Ifloat_iround -> true
   | Ifloat_round x, Ifloat_round y -> equal_rounding_mode x y
   | Ifloat_min, Ifloat_min -> true
@@ -388,8 +411,10 @@ let equal_specific_operation left right =
     Bool.equal left_is_write right_is_write
     && equal_prefetch_temporal_locality_hint left_locality right_locality
     && equal_addressing_mode left_addr right_addr
+  | Isimd l, Isimd r ->
+    Simd.equal_operation l r
   | (Ilea _ | Istore_int _ | Ioffset_loc _ | Ifloatarithmem _ | Ibswap _
     | Isqrtf | Ifloatsqrtf _ | Isextend32 | Izextend32 | Irdtsc | Irdpmc
     | Ilfence | Isfence | Imfence | Ifloat_iround | Ifloat_round _ |
-    Ifloat_min | Ifloat_max | Ipause | Icrc32q | Iprefetch _), _ ->
+    Ifloat_min | Ifloat_max | Ipause | Isimd _ | Iprefetch _), _ ->
     false
diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
@@ -41,7 +41,7 @@ let _label s = D.label ~typ:QWORD s
 
 (* Override proc.ml *)
 
-let simd_regalloc_disabled () = not (Language_extension.is_enabled SIMD)
+let simd_frontend_disabled () = not (Language_extension.is_enabled SIMD)
 
 let int_reg_name =
   [| RAX; RBX; RDI; RSI; RDX; RCX; R8; R9;
@@ -54,8 +54,8 @@ let register_name typ r =
   | Int | Val | Addr -> Reg64 (int_reg_name.(r))
   | Float -> Regf (float_reg_name.(r - 100))
   | Vec128 ->
-    if simd_regalloc_disabled () then
-      Misc.fatal_error "SIMD register allocation is not enabled.";
+    if simd_frontend_disabled ()
+    then Misc.fatal_error "SIMD types are not enabled, but got a Vec128 register.";
     Regf (float_reg_name.(r - 100))
 
 (* CFI directives *)
@@ -95,7 +95,8 @@ let frame_required = ref false
 
 let frame_size () =                     (* includes return address *)
   if !frame_required then begin
-    if simd_regalloc_disabled () then assert (num_stack_slots.(2) = 0);
+    if simd_frontend_disabled () && (num_stack_slots.(2) > 0)
+    then Misc.fatal_error "SIMD types are not enabled, but got a Vec128 stack slot.";
     let sz =
       (!stack_offset
        + 8
@@ -111,7 +112,8 @@ let slot_offset loc cl =
   match loc with
   | Incoming n -> frame_size() + n
   | Local n ->
-      if simd_regalloc_disabled () then assert (num_stack_slots.(2) = 0 && cl < 2);
+      if simd_frontend_disabled () && (num_stack_slots.(2) > 0 || cl >= 2)
+      then Misc.fatal_error "SIMD types are not enabled, but got a Vec128 stack slot.";
       (!stack_offset +
         (* Preserves original ordering (int -> float) *)
         match cl with
@@ -696,8 +698,8 @@ let add_vec128_constant bits =
     lbl
 
 let emit_vec128_constant {high; low} lbl =
-  _label (emit_label lbl);
   (* SIMD vectors respect little-endian byte order *)
+  _label (emit_label lbl);
   D.qword (Const low);
   D.qword (Const high)
 
@@ -906,6 +908,38 @@ let emit_atomic instr op (size : Cmm.atomic_bitwidth) addr =
     I.set E res8;
     I.movzx res8 res
 
+let emit_simd_instr op i =
+  (match Simd_selection.register_behavior op with
+  | RM_to_R ->
+    assert (Reg.is_reg i.res.(0))
+  | R_to_R ->
+    assert (Reg.is_reg i.arg.(0) && Reg.is_reg i.res.(0))
+  | R_RM_to_fst ->
+    assert (arg i 0 = res i 0);
+    assert (Reg.is_reg i.arg.(0))
+  | R_R_to_fst ->
+    assert (arg i 0 = res i 0);
+    assert (Reg.is_reg i.arg.(0) && Reg.is_reg i.arg.(1)));
+  match (op : Simd.operation) with
+  | SSE (Cmp_f32 n) -> I.cmpps n (arg i 1) (res i 0)
+  | SSE Add_f32 -> I.addps (arg i 1) (res i 0)
+  | SSE Sub_f32 -> I.subps (arg i 1) (res i 0)
+  | SSE Mul_f32 -> I.mulps (arg i 1) (res i 0)
+  | SSE Div_f32 -> I.divps (arg i 1) (res i 0)
+  | SSE Max_f32 -> I.maxps (arg i 1) (res i 0)
+  | SSE Min_f32 -> I.minps (arg i 1) (res i 0)
+  | SSE Rcp_f32 -> I.rcpps (arg i 0) (res i 0)
+  | SSE Sqrt_f32 -> I.sqrtps (arg i 0) (res i 0)
+  | SSE Rsqrt_f32 -> I.rsqrtps (arg i 0) (res i 0)
+  | SSE High_64_to_low_64 -> I.movhlps (arg i 1) (res i 0)
+  | SSE Low_64_to_high_64 -> I.movlhps (arg i 1) (res i 0)
+  | SSE Interleave_high_32 -> I.unpckhps (arg i 1) (res i 0)
+  | SSE Interleave_low_32 -> I.unpcklps (arg i 1) (res i 0)
+  | SSE Movemask_32 -> I.movmskps (arg i 0) (res i 0)
+  | SSE (Shuffle_32 n) -> I.shufps n (arg i 1) (res i 0)
+  | SSE42 Crc32_64 -> I.crc32 (arg i 1) (res i 0)
+  | _ -> .
+
 (* Emit an instruction *)
 let emit_instr fallthrough i =
   emit_debug_info_linear i;
@@ -1180,8 +1214,40 @@ let emit_instr fallthrough i =
       I.cvtsi2sd  (arg i 0)  (res i 0)
   | Lop(Iintoffloat) ->
       I.cvttsd2si (arg i 0) (res i 0)
-  | Lop(Iintofvalue | Ivalueofint) ->
+  | Lop(Iintofvalue | Ivalueofint | Ivectorcast Bits128) ->
       move i.arg.(0) i.res.(0)
+  | Lop(Iscalarcast (V128_of_scalar Float64x2 | V128_to_scalar Float64x2)) ->
+      I.movsd (arg i 0) (res i 0)
+  | Lop(Iscalarcast (V128_to_scalar Int64x2 | V128_of_scalar Int64x2)) ->
+      I.movq (arg i 0) (res i 0)
+  | Lop(Iscalarcast (V128_to_scalar Int32x4)) ->
+      I.movd (arg i 0) (res32 i 0)
+  | Lop(Iscalarcast (V128_of_scalar Int32x4)) ->
+      I.movd (arg32 i 0) (res i 0)
+  | Lop(Iscalarcast (V128_of_scalar Float32x4)) ->
+      (* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
+      I.cvtsd2ss (arg i 0) (res i 0)
+  | Lop(Iscalarcast (V128_to_scalar Float32x4)) ->
+      (* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
+      I.cvtss2sd (arg i 0) (res i 0)
+  | Lop(Iscalarcast (V128_to_scalar Int16x8)) ->
+      (* [movw] and [movzx] cannot operate on vector registers.
+         We must zero extend as the result is an untagged positive int.
+         CR mslater: (SIMD) remove zx once we have unboxed int16 *)
+      I.movd (arg i 0) (res32 i 0);
+      I.movzx (res16 i 0) (res i 0)
+  | Lop(Iscalarcast (V128_to_scalar Int8x16)) ->
+      (* [movb] and [movzx] cannot operate on vector registers.
+         We must zero extend as the result is an untagged positive int.
+         CR mslater: (SIMD) remove zx once we have unboxed int8 *)
+      I.movd (arg i 0) (res32 i 0);
+      I.movzx (res8 i 0) (res i 0)
+  | Lop(Iscalarcast (V128_of_scalar Int16x8 | V128_of_scalar Int8x16)) ->
+      (* [movw] and [movb] cannot operate on vector registers.
+         Moving 32 bits is OK because the argument is an untagged
+         positive int and these operations leave the top bits of the vector unspecified.
+         CR mslater: (SIMD) don't load 32 bits once we have unboxed int16/int8 *)
+      I.movd (arg32 i 0) (res i 0)
   | Lop(Iopaque) ->
       assert (i.arg.(0).loc = i.res.(0).loc)
   | Lop(Ispecific(Ilea addr)) ->
@@ -1295,9 +1361,8 @@ let emit_instr fallthrough i =
     I.sfence ()
   | Lop (Ispecific Imfence) ->
     I.mfence ()
-  | Lop (Ispecific Icrc32q) ->
-    assert (arg i 0 = res i 0);
-    I.crc32 (arg i 1) (res i 0)
+  | Lop (Ispecific (Isimd op)) ->
+    emit_simd_instr op i
   | Lop (Ispecific Ipause) ->
     I.pause ()
   | Lop (Ispecific (Iprefetch { is_write; locality; addr; })) ->
@@ -1632,8 +1697,8 @@ let make_stack_loc ~offset n (r : Reg.t) =
   (match r.typ with
    | Int | Val | Addr | Float -> ()
    | Vec128 ->
-    if simd_regalloc_disabled () then
-      Misc.fatal_error "SIMD register allocation is not enabled.");
+     if simd_frontend_disabled ()
+     then Misc.fatal_error "SIMD types are not enabled, but got a Vec128 register.");
   Reg.at_location r.typ loc
 
 (* CR mshinwell: Not now, but after code review, it would be better to