diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10119,6 +10119,18 @@ // extra instruction anyway. if (!isBoolSGPR(Cond)) break; + + // Boolean operations are represented as a 32-bit sgpr, with bits set + // potentially even for inactive lanes. We cannot just strip zext here, + // because it is mapped to v_cndmask returning only 1 or 0, which has + // the effect of masking out bits for inactive lanes. Instead, set + // the condition code based on the extended value. + if (Opc == ISD::ZERO_EXTEND && + (Cond.getOpcode() == ISD::AND || Cond.getOpcode() == ISD::OR || + Cond.getOpcode() == ISD::XOR)) + Cond = DAG.getSetCC(SL, MVT::i1, RHS, DAG.getConstant(0, SL, MVT::i32), + ISD::SETNE); + SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY; @@ -10160,6 +10172,13 @@ // extra instruction anyway. if (!isBoolSGPR(Cond)) break; + + if (Opc == ISD::ZERO_EXTEND && + (Cond.getOpcode() == ISD::AND || Cond.getOpcode() == ISD::OR || + Cond.getOpcode() == ISD::XOR)) + Cond = DAG.getSetCC(SL, MVT::i1, RHS, DAG.getConstant(0, SL, MVT::i32), + ISD::SETNE); + SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY; diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -0,0 +1,56 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s + +; Test that v_cndmask/v_cmp is added over the s_xor result and that one is used as a condition code. 
+
+; CHECK-LABEL: {{^}}combine_add_zext_xor:
+; CHECK: s_xor_b32 [[XOR:s[0-9]+]]
+; CHECK: v_cndmask_b32_e64 [[XOR_ZEXT:v[0-9]+]], 0, 1, [[XOR]]
+; CHECK: v_cmp_ne_u32_e32 vcc_lo, 0, [[XOR_ZEXT]]
+; CHECK: s_cmpk_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_addc_u32
+
+define void @combine_add_zext_xor() {
+.entry:
+  br label %.exit
+
+.exit:                                            ; preds = %10, %.entry
+  %.0311 = phi i32 [ 1050, %.entry ], [ 0, %10 ]
+  %.0 = phi i32 [ 0, %.entry ], [ %11, %10 ]
+  %.2 = phi i32 [ 0, %.entry ], [ %12, %10 ]
+  %0 = call <4 x i32> @llvm.amdgcn.image.load.1d.v4i32.i32(i32 15, i32 %.0311, <8 x i32> undef, i32 0, i32 5)
+  %.i112 = extractelement <4 x i32> %0, i32 1
+  %.i3 = extractelement <4 x i32> %0, i32 3
+  br i1 undef, label %10, label %1
+
+1:                                                ; preds = %.exit
+  %2 = or i32 0, %.i112
+  %3 = or i32 0, %2
+  %4 = icmp eq i32 %3, 0
+  %5 = or i32 %.i3, %3
+  %6 = icmp eq i32 %5, 0
+  %7 = icmp eq i32 %.i3, 1
+  %8 = and i1 %7, %4
+  %9 = or i1 %6, %8
+  br label %10
+
+10:                                               ; preds = %1, %.exit
+  %.2.0.in.in = phi i1 [ %9, %1 ], [ undef, %.exit ]
+  %.2.0.in = xor i1 %.2.0.in.in, true
+  %.2.0 = zext i1 %.2.0.in to i32
+  %11 = add i32 %.0, %.2.0
+  %12 = add i32 %.2, %.2.0
+  %13 = icmp sgt i32 %.0311, -1050
+  br i1 %13, label %.exit, label %14
+
+14:                                               ; preds = %10
+  %15 = add i32 %.2, %.2.0
+  %16 = add i32 0, %.2.0
+  %17 = add i32 %.0, %.2.0
+  %18 = or i32 %17, %16
+  %19 = or i32 %15, %18
+  %20 = or i32 undef, %19
+  ret void
+}
+
+attributes #0 = { nounwind readonly willreturn }
+declare <4 x i32> @llvm.amdgcn.image.load.1d.v4i32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -219,7 +219,9 @@
 ; GCN-LABEL: {{^}}add_and:
 ; GCN: s_and_b64 [[CC:[^,]+]],
-; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]]
+; GCN: v_cndmask_b32_e64 [[CCZEXT:v[0-9]+]], 0, 1, [[CC]]
+; GCN: v_cmp_ne_u32_e32 vcc, 0, [[CCZEXT]]
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, vcc
 ; GCN-NOT: v_cndmask
 
 ; GFX9-LABEL: {{^}}add_and: