This repository has been archived by the owner on May 14, 2024. It is now read-only.

merge amd-stg-open into amd-mainline-open
Change-Id: Id7dc4d766f6311c56260fbecef90fa9019da565a
searlmc1 committed Jan 31, 2023
2 parents 8192b03 + 1806778 commit 38caab2
Showing 72 changed files with 710 additions and 797 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -70,7 +70,7 @@ add_subdirectory(hip)
add_subdirectory(asanrtl)

enable_testing()
add_subdirectory(test/constant_folding)
add_subdirectory(test/compile)

include(Packages)

5 changes: 3 additions & 2 deletions cmake/OCL.cmake
@@ -21,7 +21,8 @@ endif()
# potential mis-aligned atomic ops detected by clang
set(CLANG_OCL_FLAGS -fcolor-diagnostics -Werror -Wno-error=atomic-alignment -x cl -Xclang
-cl-std=CL2.0 -target "${AMDGPU_TARGET_TRIPLE}" -fvisibility=protected -fomit-frame-pointer
-Xclang -finclude-default-header -nogpulib -cl-no-stdinc "${CLANG_OPTIONS_APPEND}")
-Xclang -finclude-default-header -Xclang -fexperimental-strict-floating-point
-nogpulib -cl-no-stdinc "${CLANG_OPTIONS_APPEND}")

# For compatibility with the MSVC headers we use a 32-bit wchar. Users linking
# against us must also use a short wchar.
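Note on the CLANG_OCL_FLAGS change above (annotation, not part of the diff): -fexperimental-strict-floating-point lets clang accept the strict floating-point model on targets where that support is still experimental, which is presumably why it is added here for the AMDGPU build. Under the strict model, floating-point operations are assumed to read the dynamic rounding mode and to set exception flags, so the compiler must not fold or reorder them the way it can under the default model. A minimal, hypothetical C sketch of the distinction:

    // Illustrative only. Under the default FP model the division below can be
    // constant-folded at compile time; under a strict FP model it has to stay
    // at run time, because folding would discard the inexact-result flag and
    // ignore the dynamic rounding mode.
    float third(void) { return 1.0f / 3.0f; }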
@@ -135,7 +136,7 @@ macro(opencl_bc_lib)
# Extra link step with internalize
COMMAND $<TARGET_FILE:llvm-link> -internalize -only-needed "${name}.link0${LIB_SUFFIX}"
-o "${OUT_NAME}${LIB_SUFFIX}" ${internal_link_libs}
COMMAND $<TARGET_FILE:opt> -strip
COMMAND $<TARGET_FILE:opt> -passes=amdgpu-unify-metadata,strip
-o "${OUT_NAME}${STRIP_SUFFIX}" "${OUT_NAME}${LIB_SUFFIX}"
COMMAND "${PREPARE_BUILTINS}"
-o ${OUTPUT_BC_LIB} "${OUT_NAME}${STRIP_SUFFIX}"
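Note on the opt invocation change above (annotation, not part of the diff): the legacy -strip flag is replaced with the new pass-manager syntax -passes=..., and the pipeline now also runs amdgpu-unify-metadata, which merges the duplicated module-level metadata (for example OpenCL version records) that llvm-link leaves behind, before symbols are stripped.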
83 changes: 9 additions & 74 deletions irif/inc/irif.h
@@ -28,82 +28,17 @@ extern __attribute__((const)) half2 __llvm_round_2f16(half2) __asm("llvm.round.v
extern __attribute__((const)) half2 __llvm_rint_2f16(half2) __asm("llvm.rint.v2f16");
extern __attribute__((const)) half2 __llvm_canonicalize_2f16(half2) __asm("llvm.canonicalize.v2f16");

// Intrinsics requiring wrapping
extern __attribute__((const)) uchar __llvm_ctlz_i8(uchar);
extern __attribute__((const)) ushort __llvm_ctlz_i16(ushort);
extern __attribute__((const)) uint __llvm_ctlz_i32(uint);
extern __attribute__((const)) ulong __llvm_ctlz_i64(ulong);

extern __attribute__((const)) uchar __llvm_cttz_i8(uchar);
extern __attribute__((const)) ushort __llvm_cttz_i16(ushort);
extern __attribute__((const)) uint __llvm_cttz_i32(uint);
extern __attribute__((const)) ulong __llvm_cttz_i64(ulong);

// Atomics
extern uint __llvm_ld_atomic_a1_x_dev_i32(__global uint *);
extern ulong __llvm_ld_atomic_a1_x_dev_i64(__global ulong *);
extern uint __llvm_ld_atomic_a3_x_wg_i32(__local uint *);
extern ulong __llvm_ld_atomic_a3_x_wg_i64(__local ulong *);

extern void __llvm_st_atomic_a1_x_dev_i32(__global uint *, uint);
extern void __llvm_st_atomic_a1_x_dev_i64(__global ulong *, ulong);
extern void __llvm_st_atomic_a3_x_wg_i32(__local uint *, uint);
extern void __llvm_st_atomic_a3_x_wg_i64(__local ulong *, ulong);

extern uint __llvm_atomic_add_a1_x_dev_i32(__global uint *, uint);
extern ulong __llvm_atomic_add_a1_x_dev_i64(__global ulong *, ulong);
extern uint __llvm_atomic_add_a3_x_wg_i32(__local uint *, uint);
extern ulong __llvm_atomic_add_a3_x_wg_i64(__local ulong *, ulong);

extern uint __llvm_atomic_and_a1_x_dev_i32(__global uint *, uint);
extern ulong __llvm_atomic_and_a1_x_dev_i64(__global ulong *, ulong);
extern uint __llvm_atomic_and_a3_x_wg_i32(__local uint *, uint);
extern ulong __llvm_atomic_and_a3_x_wg_i64(__local ulong *, ulong);

extern uint __llvm_atomic_or_a1_x_dev_i32(__global uint *, uint);
extern ulong __llvm_atomic_or_a1_x_dev_i64(__global ulong *, ulong);
extern uint __llvm_atomic_or_a3_x_wg_i32(__local uint *, uint);
extern ulong __llvm_atomic_or_a3_x_wg_i64(__local ulong *, ulong);

extern uint __llvm_atomic_max_a1_x_dev_i32(__global int *, int);
extern uint __llvm_atomic_umax_a1_x_dev_i32(__global uint *, uint);
extern ulong __llvm_atomic_max_a1_x_dev_i64(__global long *, long);
extern ulong __llvm_atomic_umax_a1_x_dev_i64(__global ulong *, ulong);
extern uint __llvm_atomic_max_a3_x_wg_i32(__local int *, int);
extern uint __llvm_atomic_umax_a3_x_wg_i32(__local uint *, uint);
extern ulong __llvm_atomic_max_a3_x_wg_i64(__local long *, long);
extern ulong __llvm_atomic_umax_a3_x_wg_i64(__local ulong *, ulong);

extern uint __llvm_atomic_min_a1_x_dev_i32(__global int *, int);
extern uint __llvm_atomic_umin_a1_x_dev_i32(__global uint *, uint);
extern ulong __llvm_atomic_min_a1_x_dev_i64(__global long *, long);
extern ulong __llvm_atomic_umin_a1_x_dev_i64(__global ulong *, ulong);
extern uint __llvm_atomic_min_a3_x_wg_i32(__local int *, int);
extern uint __llvm_atomic_umin_a3_x_wg_i32(__local uint *, uint);
extern ulong __llvm_atomic_min_a3_x_wg_i64(__local long *, long);
extern ulong __llvm_atomic_umin_a3_x_wg_i64(__local ulong *, ulong);

extern uint __llvm_cmpxchg_a1_x_x_dev_i32(__global uint *, uint, uint);
extern ulong __llvm_cmpxchg_a1_x_x_dev_i64(__global ulong *, ulong, ulong);
extern uint __llvm_cmpxchg_a3_x_x_wg_i32(__local uint *, uint, uint);
extern ulong __llvm_cmpxchg_a3_x_x_wg_i64(__local ulong *, ulong, ulong);
#define BUILTIN_CLZ_U8(x) (uchar)(x == 0u ? 8 : __builtin_clz(x) - 24)
#define BUILTIN_CLZ_U16(x) (ushort)(x == 0u ? 16 : __builtin_clzs(x))
#define BUILTIN_CLZ_U32(x) (uint)(x == 0u ? 32 : __builtin_clz(x))
#define BUILTIN_CLZ_U64(x) (ulong)(x == 0u ? 64 : __builtin_clzl(x))

// AMDGPU intrinsics

// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>

// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
extern uint __llvm_amdgcn_update_dpp_i32(uint, uint, uint, uint, uint, bool) __asm("llvm.amdgcn.update.dpp.i32");

// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
extern uint __llvm_amdgcn_dpp8_i32(uint, uint) __asm("llvm.amdgcn.dpp8.i32");

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
extern uint __llvm_amdgcn_permlane16(uint, uint, uint, uint, bool, bool) __asm("llvm.amdgcn.permlane16");

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
extern uint __llvm_amdgcn_permlanex16(uint, uint, uint, uint, bool, bool) __asm("llvm.amdgcn.permlanex16");
#define BUILTIN_CTZ_U8(x) (uchar)(x == 0u ? (uchar)8 : __builtin_ctz((uint)x))
#define BUILTIN_CTZ_U16(x) (ushort)(x == 0u ? 16 : __builtin_ctzs(x))
#define BUILTIN_CTZ_U32(x) (uint)(x == 0u ? 32 : __builtin_ctz(x))
#define BUILTIN_CTZ_U64(x) (ulong)(x == 0u ? 64 : __builtin_ctzl(x))

// AMDGPU intrinsics
extern __attribute__((const, convergent)) ulong __llvm_amdgcn_icmp_i64_i32(uint, uint, uint) __asm("llvm.amdgcn.icmp.i64.i32");
extern __attribute__((const, convergent)) ulong __llvm_amdgcn_icmp_i64_i64(ulong, ulong, uint) __asm("llvm.amdgcn.icmp.i64.i64");
extern __attribute__((const, convergent)) ulong __llvm_amdgcn_fcmp_i64_f32(float, float, uint) __asm("llvm.amdgcn.fcmp.i64.f32");
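Note on the new BUILTIN_CLZ_*/BUILTIN_CTZ_* macros above (annotation, not part of the diff): the deleted __llvm_ctlz_*/__llvm_cttz_* wrappers called llvm.ctlz/llvm.cttz with the is-zero-poison flag set to false, so they were well defined for a zero argument; the clang builtins __builtin_clz/__builtin_ctz that replace them are undefined for zero, which is why each macro special-cases 0 and returns the operand's bit width, and why the 8-bit CLZ variant subtracts 24 from the 32-bit __builtin_clz result. A small usage sketch:

    // Illustrative values only; not part of the header.
    uchar a = BUILTIN_CLZ_U8((uchar)0x10);   // 3: three leading zero bits in an 8-bit value
    uchar b = BUILTIN_CLZ_U8((uchar)0);      // 8: zero handled explicitly, no undefined behavior
    uint  c = BUILTIN_CTZ_U32(0x80000000u);  // 31 trailing zero bits
    ulong d = BUILTIN_CTZ_U64(0ul);          // 64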
221 changes: 0 additions & 221 deletions irif/src/atomic.ll

This file was deleted.

24 changes: 0 additions & 24 deletions irif/src/cz.ll
@@ -8,35 +8,11 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
target triple = "amdgcn-amd-amdhsa"

declare i8 @llvm.ctlz.i8(i8, i1) #0
declare i16 @llvm.ctlz.i16(i16, i1) #0
declare i32 @llvm.ctlz.i32(i32, i1) #0
declare i64 @llvm.ctlz.i64(i64, i1) #0
declare i8 @llvm.cttz.i8(i8, i1) #0
declare i16 @llvm.cttz.i16(i16, i1) #0
declare i32 @llvm.cttz.i32(i32, i1) #0
declare i64 @llvm.cttz.i64(i64, i1) #0

define protected i8 @__llvm_ctlz_i8(i8) #1 {
%2 = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
ret i8 %2
}

define protected i16 @__llvm_ctlz_i16(i16) #1 {
%2 = call i16 @llvm.ctlz.i16(i16 %0, i1 false)
ret i16 %2
}

define protected i32 @__llvm_ctlz_i32(i32) #1 {
%2 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
ret i32 %2
}

define protected i64 @__llvm_ctlz_i64(i64) #1 {
%2 = call i64 @llvm.ctlz.i64(i64 %0, i1 false)
ret i64 %2
}

define protected i8 @__llvm_cttz_i8(i8) #1 {
%2 = call i8 @llvm.cttz.i8(i8 %0, i1 false)
ret i8 %2
