Skip to content

Commit

Permalink
[AMDGPU] Add pseudo scalar trans instructions for GFX12 (llvm#75204)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbrkusanin authored Dec 15, 2023
1 parent 2812cb0 commit 569ef8d
Show file tree
Hide file tree
Showing 18 changed files with 2,496 additions and 26 deletions.
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,12 @@ def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint",
"Has single-use VGPR hint instructions"
>;

def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
"HasPseudoScalarTrans",
"true",
"Has Pseudo Scalar Transcendental instructions"
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
Expand Down Expand Up @@ -1467,6 +1473,7 @@ def FeatureISAVersion12 : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeaturePseudoScalarTrans,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;
Expand Down Expand Up @@ -2009,6 +2016,9 @@ def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">,
AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>;

def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;

def HasGDS : Predicate<"Subtarget->hasGDS()">;

def HasGWS : Predicate<"Subtarget->hasGWS()">;
Expand Down
28 changes: 20 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3781,14 +3781,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case AMDGPU::G_FSQRT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
case AMDGPU::G_SSUBSAT:
case AMDGPU::G_UADDSAT:
case AMDGPU::G_USUBSAT:
case AMDGPU::G_FMAD:
case AMDGPU::G_FSQRT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
case AMDGPU::G_FLDEXP:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
Expand Down Expand Up @@ -4253,12 +4259,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_log_clamp:
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_sqrt:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_fmul_legacy:
Expand Down Expand Up @@ -4315,6 +4316,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_sqrt: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:
if (isSALUMapping(MI))
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)

DECODE_OPERAND_REG_7(SReg_32, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/GCNProcessors.td
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,10 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel,
// GCN GFX12.
//===----------------------------------------------------------------------===//

def : ProcessorModel<"gfx1200", GFX11SpeedModel,
def : ProcessorModel<"gfx1200", GFX12SpeedModel,
FeatureISAVersion12.Features
>;

def : ProcessorModel<"gfx1201", GFX11SpeedModel,
def : ProcessorModel<"gfx1201", GFX12SpeedModel,
FeatureISAVersion12.Features
>;
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool ScalarizeGlobal = false;
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
bool HasPseudoScalarTrans = false;

bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
Expand Down Expand Up @@ -1160,6 +1161,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }

bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }

/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
Expand Down
12 changes: 11 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5305,6 +5305,16 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64;
case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64;
case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64;
case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64;
case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64;
}
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
Expand Down Expand Up @@ -7189,7 +7199,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Use the new VALU Opcode.
auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
.setMIFlags(Inst.getFlags());
if (isVOP3(NewOpcode)) {
if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
// Intersperse VOP3 modifiers among the SALU operands.
NewInstr->addOperand(Inst.getOperand(0));
if (AMDGPU::getNamedOperandIdx(NewOpcode,
Expand Down
36 changes: 36 additions & 0 deletions llvm/lib/Target/AMDGPU/SISchedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def Write8PassDGEMM : SchedWrite;
// Scalar float instructions
def WriteSFPU : SchedWrite;

// F16 or F32 pseudo scalar transcendental instructions
def WritePseudoScalarTrans : SchedWrite;

// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)
Expand All @@ -93,6 +96,7 @@ def SIDPFullSpeedModel : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
def GFX12SpeedModel : SISchedMachineModel;

// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
Expand Down Expand Up @@ -174,6 +178,7 @@ multiclass SICommonWriteRes {
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;

def : UnsupportedWriteRes<WriteSFPU>;
def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1

def : ReadAdvance<MIVGPRRead, -2>;
Expand Down Expand Up @@ -318,6 +323,7 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;

def : UnsupportedWriteRes<WriteSFPU>;
def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1

def : InstRW<[WriteCopy], (instrs COPY)>;
Expand Down Expand Up @@ -351,6 +357,36 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
} // End RetireOOO = 1

def : UnsupportedWriteRes<WritePseudoScalarTrans>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX11SpeedModel

let SchedModel = GFX12SpeedModel in {

def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>;
def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC], 7>;

def : HWWriteRes<WriteBranch, [HWBranch], 32>;
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX12SpeedModel
34 changes: 22 additions & 12 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -675,19 +675,8 @@ let SubtargetPredicate = isGFX12Plus in {

} // End SubtargetPredicate = isGFX12Plus

def SelectPat : PatFrag <
(ops node:$src1, node:$src2),
(select SCC, $src1, $src2),
[{ return !N->isDivergent(); }]
>;

let Uses = [SCC] in {
let AddedComplexity = 20 in {
def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
[(set i32:$sdst, (SelectPat i32:$src0, i32:$src1))]
>;
}

def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]

Expand Down Expand Up @@ -1808,6 +1797,27 @@ def : GetFPModePat<fpmode_mask_gfx6plus>;
// SOP2 Patterns
//===----------------------------------------------------------------------===//

def UniformSelect : PatFrag<
(ops node:$src0, node:$src1),
(select SCC, $src0, $src1),
[{ return !N->isDivergent(); }]
>;

let AddedComplexity = 20 in {
def : GCNPat<
(i32 (UniformSelect i32:$src0, i32:$src1)),
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>;

// TODO: The predicate should not be necessary, but enabling this pattern for
// all subtargets generates worse code in some cases.
let OtherPredicates = [HasPseudoScalarTrans] in
def : GCNPat<
(f32 (UniformSelect f32:$src0, f32:$src1)),
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>;
}

// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
Expand Down
53 changes: 53 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,49 @@ let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
}

class VOP_Pseudo_Scalar<RegisterClass Dst, RegisterOperand SrcOp,
ValueType dstVt, ValueType srcVt = dstVt>
: VOPProfile<[dstVt, srcVt, untyped, untyped]> {
let DstRC = VOPDstOperand<Dst>;
let Src0RC64 = SrcOp;

let HasOMod = 1;
let HasModifiers = 1;
}

def VOP_Pseudo_Scalar_F32 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f32, f32>;
def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>;

let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1,
isReMaterializable = 1, SchedRW = [WritePseudoScalarTrans] in {
defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32, AMDGPUexp>;
defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32, AMDGPUlog>;
defm V_S_LOG_F16 : VOP3PseudoScalarInst<"v_s_log_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_RCP_F32 : VOP3PseudoScalarInst<"v_s_rcp_f32", VOP_Pseudo_Scalar_F32, AMDGPUrcp>;
defm V_S_RCP_F16 : VOP3PseudoScalarInst<"v_s_rcp_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_RSQ_F32 : VOP3PseudoScalarInst<"v_s_rsq_f32", VOP_Pseudo_Scalar_F32, AMDGPUrsq>;
defm V_S_RSQ_F16 : VOP3PseudoScalarInst<"v_s_rsq_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_SQRT_F32 : VOP3PseudoScalarInst<"v_s_sqrt_f32", VOP_Pseudo_Scalar_F32, any_amdgcn_sqrt>;
defm V_S_SQRT_F16 : VOP3PseudoScalarInst<"v_s_sqrt_f16", VOP_Pseudo_Scalar_F16>;
}

class PseudoScalarPatF16<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat <
(f16 (UniformUnaryFrag<node> (f16 (VOP3Mods0 f16:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)))),
(f16 (COPY_TO_REGCLASS (f32 (inst i32:$src0_modifiers, f16:$src0, i1:$clamp,
i32:$omod)),
SReg_32_XEXEC))
>;

let SubtargetPredicate = HasPseudoScalarTrans in {
def : PseudoScalarPatF16<AMDGPUexpf16, V_S_EXP_F16_e64>;
def : PseudoScalarPatF16<AMDGPUlogf16, V_S_LOG_F16_e64>;
def : PseudoScalarPatF16<AMDGPUrcp, V_S_RCP_F16_e64>;
def : PseudoScalarPatF16<AMDGPUrsq, V_S_RSQ_F16_e64>;
def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
}

//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -933,6 +976,16 @@ defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26f>;
defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>;
defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>;
defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>;
defm V_S_LOG_F16 : VOP3Only_Real_Base_gfx12<0x283>;
defm V_S_RCP_F32 : VOP3Only_Real_Base_gfx12<0x284>;
defm V_S_RCP_F16 : VOP3Only_Real_Base_gfx12<0x285>;
defm V_S_RSQ_F32 : VOP3Only_Real_Base_gfx12<0x286>;
defm V_S_RSQ_F16 : VOP3Only_Real_Base_gfx12<0x287>;
defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12<0x288>;
defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12<0x289>;
defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12<0x2fe, "V_MAD_U64_U32", "v_mad_co_u64_u32">;
defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12<0x2ff, "V_MAD_I64_I32", "v_mad_co_i64_i32">;
defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1303,6 +1303,19 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f
} // end SubtargetPredicate = isGFX11Plus
}

class UniformUnaryFragOrOp<SDPatternOperator Op> {
SDPatternOperator ret = !if(!or(!isa<SDNode>(Op), !isa<PatFrags>(Op)),
UniformUnaryFrag<Op>, Op);
}

multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag> {
def _e64 : VOP3_Pseudo<OpName, P, [(set P.DstVT:$vdst,
(UniformUnaryFragOrOp<node>.ret
(P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp,
i32:$omod))))]>;
}

//===----------------------------------------------------------------------===//
// VOP3 DPP
//===----------------------------------------------------------------------===//
Expand Down
Loading

0 comments on commit 569ef8d

Please sign in to comment.