diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1050,6 +1050,21 @@
         {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
   } else {
+    // The result of a boolean operation is represented as a 32-bit/64-bit SGPR
+    // and may have bits set even for inactive lanes, so mask them out here.
+    unsigned CondOpc = CI->getOpcode();
+    if (CondOpc == ISD::AND || CondOpc == ISD::OR || CondOpc == ISD::XOR) {
+      auto ST = static_cast<const GCNSubtarget *>(Subtarget);
+      CI = SDValue(
+          CurDAG->getMachineNode(
+              ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, DL,
+              MVT::i1,
+              CurDAG->getRegister(
+                  ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, MVT::i1),
+              CI),
+          0);
+    }
+
     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                    : AMDGPU::S_SUB_CO_PSEUDO;
     CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
   }
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,GFX10
+
+; Test that inactive lanes are masked out in the s_xor result before it is used as a condition.
+
+; CHECK-LABEL: {{^}}combine_add_zext_xor:
+
+; GFX9: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]]
+; GFX9: s_and_b64 [[CC:s\[[0-9]+:[0-9]+\]]], exec, [[XOR]]
+; GFX9: s_cmp_lg_u64 [[CC]], 0
+; GFX10: s_xor_b32 [[XOR:s[0-9]+]]
+; GFX10: s_and_b32 [[CC:s[0-9]+]], exec_lo, [[XOR]]
+; GFX10: s_cmpk_lg_u32 [[CC]], 0
+; CHECK: s_addc_u32
+
+define void @combine_add_zext_xor() {
+.entry:
+  br label %.exit
+
+.exit:                                            ; preds = %10, %.entry
+  %.0311 = phi i32 [ 1050, %.entry ], [ 0, %10 ]
+  %.0 = phi i32 [ 0, %.entry ], [ %11, %10 ]
+  %.2 = phi i32 [ 0, %.entry ], [ %12, %10 ]
+  %0 = call <4 x i32> @llvm.amdgcn.image.load.1d.v4i32.i32(i32 15, i32 %.0311, <8 x i32> undef, i32 0, i32 1)
+  %.i112 = extractelement <4 x i32> %0, i32 1
+  %.i3 = extractelement <4 x i32> %0, i32 3
+  br i1 undef, label %10, label %1
+
+1:                                                ; preds = %.exit
+  %2 = or i32 0, %.i112
+  %3 = or i32 0, %2
+  %4 = icmp eq i32 %3, 0
+  %5 = or i32 %.i3, %3
+  %6 = icmp eq i32 %5, 0
+  %7 = icmp eq i32 %.i3, 1
+  %8 = and i1 %7, %4
+  %9 = or i1 %6, %8
+  br label %10
+
+10:                                               ; preds = %1, %.exit
+  %.2.0.in.in = phi i1 [ %9, %1 ], [ undef, %.exit ]
+  %.2.0.in = xor i1 %.2.0.in.in, true
+  %.2.0 = zext i1 %.2.0.in to i32
+  %11 = add i32 %.0, %.2.0
+  %12 = add i32 %.2, %.2.0
+  %13 = icmp sgt i32 %.0311, -1050
+  br i1 %13, label %.exit, label %14
+
+14:                                               ; preds = %10
+  %15 = add i32 %.2, %.2.0
+  %16 = add i32 0, %.2.0
+  %17 = add i32 %.0, %.2.0
+  %18 = or i32 %17, %16
+  %19 = or i32 %15, %18
+  %20 = or i32 undef, %19
+  ret void
+}
+
+attributes #0 = { nounwind readonly willreturn }
+declare <4 x i32> @llvm.amdgcn.image.load.1d.v4i32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
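
Side note, not part of the patch: a minimal standalone C++ sketch, with made-up lane-mask values, of why the carry mask has to be ANDed with EXEC before it is compared against zero. The helper name carryIn and the constants are hypothetical; the sketch only models the selected s_and_b64/s_cmp_lg_u64 sequence in wave64 mode, where lane masks produced by s_xor/s_or/s_and may carry stray bits for inactive lanes.

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring the selected sequence
//   s_and_b64    cc, exec, mask
//   s_cmp_lg_u64 cc, 0
// i.e. "does any *active* lane have its carry bit set?"
static bool carryIn(uint64_t LaneMask, uint64_t Exec) {
  return (LaneMask & Exec) != 0;
}

int main() {
  const uint64_t Exec = 0x1; // wave64 wave with only lane 0 active
  const uint64_t Mask = 0x2; // s_xor left a stray bit in inactive lane 1

  assert((Mask != 0) == true);           // unmasked compare: bogus carry of 1
  assert(carryIn(Mask, Exec) == false);  // exec-masked compare: correct carry of 0
  return 0;
}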