diff --git a/lgc/patch/PatchEntryPointMutate.cpp b/lgc/patch/PatchEntryPointMutate.cpp index 9a836ee2ae..a14100f8fa 100644 --- a/lgc/patch/PatchEntryPointMutate.cpp +++ b/lgc/patch/PatchEntryPointMutate.cpp @@ -402,8 +402,6 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInpu // ret void unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); Type *waveMaskTy = builder.getIntNTy(waveSize); - auto *chainBlock = BasicBlock::Create(func->getContext(), "chain.block", func); - auto *retBlock = BasicBlock::Create(func->getContext(), "ret.block", func); // For continufy based continuation, the vgpr list: LocalInvocationId, vcr, vsp, ... unsigned vcrIndexInVgpr = shaderInputs ? 1 : 0; auto *vcr = builder.CreateExtractValue(vgprArg, vcrIndexInVgpr); @@ -437,13 +435,21 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInpu execMask = builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_wwm, execMask); } - auto *isNullTarget = builder.CreateICmpEQ(targetVcr, builder.getInt32(0)); - builder.CreateCondBr(isNullTarget, retBlock, chainBlock); - - builder.SetInsertPoint(retBlock); - builder.CreateRetVoid(); + BasicBlock *chainBlock = nullptr; + // We only need to insert the return block if there is any return in original function, otherwise we just insert + // everything in the tail block. + if (!retInstrs.empty()) { + chainBlock = BasicBlock::Create(func->getContext(), "chain.block", func); + auto *retBlock = BasicBlock::Create(func->getContext(), "ret.block", func); + auto *isNullTarget = builder.CreateICmpEQ(targetVcr, builder.getInt32(0)); + builder.CreateCondBr(isNullTarget, retBlock, chainBlock); + + builder.SetInsertPoint(retBlock); + builder.CreateRetVoid(); + } - builder.SetInsertPoint(chainBlock); + if (chainBlock) + builder.SetInsertPoint(chainBlock); // Mask off metadata bits and setup jump target. Value *addr32 = builder.CreateAnd(targetVcr, builder.getInt32(~0x3fu)); AddressExtender addressExtender(func); diff --git a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc index 7afafefcb8..2b06d4351f 100644 --- a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc +++ b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc @@ -74,15 +74,10 @@ entry: ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP40]]) ; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP39]]) ; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP41]]) -; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP42]], 0 -; CHECK-NEXT: br i1 [[TMP44]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] -; CHECK: chain.block: -; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP42]], -64 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP45]], i64 0 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast <2 x i32> [[TMP46]] to i64 -; CHECK-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32s(ptr inreg [[TMP48]], i32 inreg [[TMP43]], <16 x i32> inreg [[TMP33]], { i32, ptr addrspace(5), i32 } [[TMP13]], i32 0) +; CHECK-NEXT: [[TMP44:%.*]] = and i32 [[TMP42]], -64 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP44]], i64 0 +; CHECK-NEXT: [[TMP46:%.*]] = bitcast <2 x i32> [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32s(ptr inreg [[TMP47]], i32 inreg [[TMP43]], <16 x i32> inreg [[TMP33]], { i32, ptr addrspace(5), i32 } [[TMP13]], i32 0) ; CHECK-NEXT: unreachable -; CHECK: ret.block: -; CHECK-NEXT: ret void ; diff --git a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc index 9b1b1207c9..8ff0f669f8 100644 --- a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc @@ -109,15 +109,10 @@ attributes #5 = { nounwind willreturn memory(none) } ; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 [[TMP44]]) ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i32 [[TMP41]], [[TMP45]] ; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP46]]) -; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i32 [[TMP45]], 0 -; CHECK-NEXT: br i1 [[TMP48]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] -; CHECK: chain.block: -; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP45]], -64 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP49]], i64 0 -; CHECK-NEXT: [[TMP51:%.*]] = bitcast <2 x i32> [[TMP50]] to i64 -; CHECK-NEXT: [[TMP52:%.*]] = inttoptr i64 [[TMP51]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP52]], i32 inreg [[TMP47]], <16 x i32> inreg [[TMP40]], { i32, ptr addrspace(5), i32, i32 } [[TMP20]], i32 0) +; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[TMP45]], -64 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP48]], i64 0 +; CHECK-NEXT: [[TMP50:%.*]] = bitcast <2 x i32> [[TMP49]] to i64 +; CHECK-NEXT: [[TMP51:%.*]] = inttoptr i64 [[TMP50]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP51]], i32 inreg [[TMP47]], <16 x i32> inreg [[TMP40]], { i32, ptr addrspace(5), i32, i32 } [[TMP20]], i32 0) ; CHECK-NEXT: unreachable -; CHECK: ret.block: -; CHECK-NEXT: ret void ; diff --git a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc index f90c54bb6c..a7c926d7bf 100644 --- a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc @@ -253,17 +253,12 @@ attributes #7 = { nounwind willreturn memory(inaccessiblemem: read) } ; CHECK-NEXT: [[TMP100:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP99]]) ; CHECK-NEXT: [[TMP101:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP98]]) ; CHECK-NEXT: [[TMP102:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP100]]) -; CHECK-NEXT: [[TMP103:%.*]] = icmp eq i32 [[TMP101]], 0 -; CHECK-NEXT: br i1 [[TMP103]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] -; CHECK: chain.block: -; CHECK-NEXT: [[TMP104:%.*]] = and i32 [[TMP101]], -64 -; CHECK-NEXT: [[TMP105:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP104]], i64 0 -; CHECK-NEXT: [[TMP106:%.*]] = bitcast <2 x i32> [[TMP105]] to i64 -; CHECK-NEXT: [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32i32s(ptr inreg [[TMP107]], i32 inreg [[TMP102]], <20 x i32> inreg [[TMP92]], { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP65]], i32 0) +; CHECK-NEXT: [[TMP103:%.*]] = and i32 [[TMP101]], -64 +; CHECK-NEXT: [[TMP104:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP103]], i64 0 +; CHECK-NEXT: [[TMP105:%.*]] = bitcast <2 x i32> [[TMP104]] to i64 +; CHECK-NEXT: [[TMP106:%.*]] = inttoptr i64 [[TMP105]] to ptr +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32i32s(ptr inreg [[TMP106]], i32 inreg [[TMP102]], <20 x i32> inreg [[TMP92]], { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP65]], i32 0) ; CHECK-NEXT: unreachable -; CHECK: ret.block: -; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@_rgen_1.resume.0 diff --git a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc index 09803c6d58..f908fc797b 100644 --- a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc @@ -109,17 +109,12 @@ define void @test.2({ ptr addrspace(32) } %state) !lgc.cps !{i32 1} !lgc.shaders ; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP47]]) ; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP46]]) ; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP48]]) -; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i32 [[TMP49]], 0 -; CHECK-NEXT: br i1 [[TMP51]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] -; CHECK: chain.block: -; CHECK-NEXT: [[TMP52:%.*]] = and i32 [[TMP49]], -64 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP52]], i64 0 -; CHECK-NEXT: [[TMP54:%.*]] = bitcast <2 x i32> [[TMP53]] to i64 -; CHECK-NEXT: [[TMP55:%.*]] = inttoptr i64 [[TMP54]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP55]], i32 inreg [[TMP50]], <16 x i32> inreg [[TMP40]], { i32, ptr addrspace(5), i32, i32 } [[TMP20]], i32 0) +; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP49]], -64 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP51]], i64 0 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast <2 x i32> [[TMP52]] to i64 +; CHECK-NEXT: [[TMP54:%.*]] = inttoptr i64 [[TMP53]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP54]], i32 inreg [[TMP50]], <16 x i32> inreg [[TMP40]], { i32, ptr addrspace(5), i32, i32 } [[TMP20]], i32 0) ; CHECK-NEXT: unreachable -; CHECK: ret.block: -; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@test.1 @@ -172,17 +167,12 @@ define void @test.2({ ptr addrspace(32) } %state) !lgc.cps !{i32 1} !lgc.shaders ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP40]]) ; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP39]]) ; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP41]]) -; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP42]], 0 -; CHECK-NEXT: br i1 [[TMP44]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] -; CHECK: chain.block: -; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP42]], -64 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP45]], i64 0 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast <2 x i32> [[TMP46]] to i64 -; CHECK-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5) }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5s(ptr inreg [[TMP48]], i32 inreg [[TMP43]], <16 x i32> inreg [[TMP33]], { i32, ptr addrspace(5) } [[TMP13]], i32 0) +; CHECK-NEXT: [[TMP44:%.*]] = and i32 [[TMP42]], -64 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP44]], i64 0 +; CHECK-NEXT: [[TMP46:%.*]] = bitcast <2 x i32> [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5) }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5s(ptr inreg [[TMP47]], i32 inreg [[TMP43]], <16 x i32> inreg [[TMP33]], { i32, ptr addrspace(5) } [[TMP13]], i32 0) ; CHECK-NEXT: unreachable -; CHECK: ret.block: -; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@test.2 diff --git a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc index 985a323f92..5d60d2d644 100644 --- a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc @@ -119,17 +119,12 @@ else: ; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP47]]) ; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP46]]) ; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP48]]) -; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i32 [[TMP49]], 0 -; CHECK-NEXT: br i1 [[TMP51]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] -; CHECK: chain.block: -; CHECK-NEXT: [[TMP52:%.*]] = and i32 [[TMP49]], -64 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP52]], i64 0 -; CHECK-NEXT: [[TMP54:%.*]] = bitcast <2 x i32> [[TMP53]] to i64 -; CHECK-NEXT: [[TMP55:%.*]] = inttoptr i64 [[TMP54]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP55]], i32 inreg [[TMP50]], <16 x i32> inreg [[TMP40]], { i32, ptr addrspace(5), i32, i32 } [[TMP20]], i32 0) +; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP49]], -64 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP51]], i64 0 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast <2 x i32> [[TMP52]] to i64 +; CHECK-NEXT: [[TMP54:%.*]] = inttoptr i64 [[TMP53]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP54]], i32 inreg [[TMP50]], <16 x i32> inreg [[TMP40]], { i32, ptr addrspace(5), i32, i32 } [[TMP20]], i32 0) ; CHECK-NEXT: unreachable -; CHECK: ret.block: -; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@unify_jump_ret