diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
@@ -96,6 +96,9 @@
       if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
         return true;
 
+      if (TII->isKillTerminator(I->getOpcode())) // Opcode is a kill terminator.
+        return true; // Same result as the unwanted-effects check above.
+
       // These instructions are potentially expensive even if EXEC = 0.
       if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
           I->getOpcode() == AMDGPU::S_WAITCNT)
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -396,8 +396,52 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}cbranch_kill:
+; CHECK-NOT: exp null off, off, off, off done vm
+define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) { ; amdgpu_ps function: conditional branch with a kill on one side.
+.entry:
+  %val0 = extractelement <2 x float> %1, i32 0
+  %val1 = extractelement <2 x float> %1, i32 1
+  %p0 = call float @llvm.amdgcn.interp.p1(float %val0, i32 immarg 0, i32 immarg 1, i32 %0) #2
+  %sample = call float @llvm.amdgcn.image.sample.l.2darray.f32.f32(i32 1, float %p0, float %p0, float %p0, float 0.000000e+00, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
+  %cond0 = fcmp ugt float %sample, 0.000000e+00 ; Branch condition comes from the sampled value.
+  br i1 %cond0, label %live, label %kill
+
+kill: ; Taken when %cond0 is false.
+  call void @llvm.amdgcn.kill(i1 false) ; Kill intrinsic called with a constant false operand.
+  br label %export
+
+live: ; Taken when %cond0 is true; produces the four scaled values.
+  %i0 = call float @llvm.amdgcn.interp.p1(float %val0, i32 immarg 0, i32 immarg 0, i32 %0) #2
+  %i1 = call float @llvm.amdgcn.interp.p2(float %i0, float %val1, i32 immarg 0, i32 immarg 0, i32 %0) #2
+  %i2 = call float @llvm.amdgcn.interp.p1(float %val0, i32 immarg 1, i32 immarg 0, i32 %0) #2
+  %i3 = call float @llvm.amdgcn.interp.p2(float %i2, float %val1, i32 immarg 1, i32 immarg 0, i32 %0) #2
+  %scale.i0 = fmul reassoc nnan nsz arcp contract float %i0, %sample
+  %scale.i1 = fmul reassoc nnan nsz arcp contract float %i1, %sample
+  %scale.i2 = fmul reassoc nnan nsz arcp contract float %i2, %sample
+  %scale.i3 = fmul reassoc nnan nsz arcp contract float %i3, %sample
+  br label %export
+
+export: ; Join point: each phi is undef from %kill and a scaled value from %live.
+  %proxy.0.0 = phi float [ undef, %kill ], [ %scale.i0, %live ]
+  %proxy.0.1 = phi float [ undef, %kill ], [ %scale.i1, %live ]
+  %proxy.0.2 = phi float [ undef, %kill ], [ %scale.i2, %live ]
+  %proxy.0.3 = phi float [ undef, %kill ], [ %scale.i3, %live ]
+  %out.0 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %proxy.0.0, float %proxy.0.1) #2
+  %out.1 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %proxy.0.2, float %proxy.0.3) #2
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> %out.0, <2 x half> %out.1, i1 immarg true, i1 immarg true) #3 ; Exports the two packed <2 x half> values.
+  ret void
+}
+
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #2
+declare float @llvm.amdgcn.image.sample.l.2darray.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
 declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare void @llvm.amdgcn.kill(i1) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { inaccessiblememonly nounwind writeonly }