diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1050,6 +1050,30 @@
         {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
   } else {
+    // The result of a boolean operation is represented as a 32-bit/64-bit SGPR
+    // that may have bits set even for inactive lanes, so mask them out here.
+    // Masking is not needed for SETCC and class intrinsics. We also exclude
+    // trivial patterns with two SETCCs connected by a boolean operation.
+    if (CI->getValueType(0) == MVT::i1) {
+      unsigned Opc = CI->getOpcode();
+      bool SkipMasking = (Opc == ISD::SETCC || Opc == AMDGPUISD::FP_CLASS);
+      SkipMasking |= (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR) &&
+                     (CI->getOperand(0).getOpcode() == ISD::SETCC &&
+                      CI->getOperand(1).getOpcode() == ISD::SETCC);
+
+      if (!SkipMasking) {
+        const auto *ST = static_cast<const GCNSubtarget *>(Subtarget);
+        CI = SDValue(
+            CurDAG->getMachineNode(
+                ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, DL,
+                MVT::i1,
+                CurDAG->getRegister(
+                    ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, MVT::i1),
+                CI),
+            0);
+      }
+    }
+
     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                    : AMDGPU::S_SUB_CO_PSEUDO;
     CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -0,0 +1,183 @@
+; RUN: llc -march=amdgcn -start-before=amdgpu-isel -stop-after=amdgpu-isel -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck %s
+
+; Test that inactive lanes are masked out in the s_xor result before it is used as the carry-in of the add.
+
+; CHECK-LABEL: name: combine_add_zext_xor
+; CHECK: S_XOR_B64
+; CHECK-NEXT: S_AND_B64 $exec
+; CHECK: S_ADD_CO_PSEUDO
+
+define void @combine_add_zext_xor() {
+.entry:
+  br label %.exit, !amdgpu.uniform !0
+
+.exit:                                            ; preds = %bb7, %.entry
+  %t = phi i32 [ 1050, %.entry ], [ 0, %bb7 ]
+  %t1 = phi i32 [ 0, %.entry ], [ %t7, %bb7 ]
+  br i1 undef, label %bb7, label %bb, !amdgpu.uniform !0
+
+bb:                                               ; preds = %.exit
+  %t2 = add i32 %t, 2
+  %t3 = icmp eq i32 %t2, 0
+  br label %bb7, !amdgpu.uniform !0
+
+bb7:                                              ; preds = %bb, %.exit
+  %t4 = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t5 = xor i1 %t4, true
+  %t6 = zext i1 %t5 to i32
+  %t7 = add i32 %t1, %t6
+  br label %.exit, !amdgpu.uniform !0
+}
+
+; Test that inactive lanes are masked out in the s_and result before it is used as the carry-in of the add.
+
+; CHECK-LABEL: name: combine_add_zext_and
+; CHECK: S_AND_B64
+; CHECK-NEXT: S_AND_B64 $exec
+; CHECK: S_ADD_CO_PSEUDO
+
+define void @combine_add_zext_and() {
+.entry:
+  br label %.exit, !amdgpu.uniform !0
+
+.exit:                                            ; preds = %bb7, %.entry
+  %t = phi i32 [ 1050, %.entry ], [ 0, %bb7 ]
+  %t1 = phi i32 [ 0, %.entry ], [ %t7, %bb7 ]
+  br i1 undef, label %bb7, label %bb, !amdgpu.uniform !0
+
+bb:                                               ; preds = %.exit
+  %t2 = add i32 %t, 2
+  %t3 = icmp eq i32 %t2, 0
+  br label %bb7, !amdgpu.uniform !0
+
+bb7:                                              ; preds = %bb, %.exit
+  %t4 = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t4a = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t5 = and i1 %t4, %t4a
+  %t6 = zext i1 %t5 to i32
+  %t7 = add i32 %t1, %t6
+  br label %.exit, !amdgpu.uniform !0
+}
+
+; Test that inactive lanes are masked out in the s_or result before it is used as the carry-in of the add.
+
+; CHECK-LABEL: name: combine_add_zext_or
+; CHECK: S_OR_B64
+; CHECK-NEXT: S_AND_B64 $exec
+; CHECK: S_ADD_CO_PSEUDO
+
+define void @combine_add_zext_or() {
+.entry:
+  br label %.exit, !amdgpu.uniform !0
+
+.exit:                                            ; preds = %bb7, %.entry
+  %t = phi i32 [ 1050, %.entry ], [ 0, %bb7 ]
+  %t1 = phi i32 [ 0, %.entry ], [ %t7, %bb7 ]
+  br i1 undef, label %bb7, label %bb, !amdgpu.uniform !0
+
+bb:                                               ; preds = %.exit
+  %t2 = add i32 %t, 2
+  %t3 = icmp eq i32 %t2, 0
+  br label %bb7, !amdgpu.uniform !0
+
+bb7:                                              ; preds = %bb, %.exit
+  %t4 = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t4a = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t5 = or i1 %t4, %t4a
+  %t6 = zext i1 %t5 to i32
+  %t7 = add i32 %t1, %t6
+  br label %.exit, !amdgpu.uniform !0
+}
+
+; Test that inactive lanes are masked out in the s_xor result before it is used as the carry-in of the sub.
+
+; CHECK-LABEL: name: combine_sub_zext_xor
+; CHECK: S_XOR_B64
+; CHECK-NEXT: S_AND_B64 $exec
+; CHECK: S_SUB_CO_PSEUDO
+
+define void @combine_sub_zext_xor() {
+.entry:
+  br label %.exit, !amdgpu.uniform !0
+
+.exit:                                            ; preds = %bb7, %.entry
+  %t = phi i32 [ 1050, %.entry ], [ 0, %bb7 ]
+  %t1 = phi i32 [ 0, %.entry ], [ %t7, %bb7 ]
+  br i1 undef, label %bb7, label %bb, !amdgpu.uniform !0
+
+bb:                                               ; preds = %.exit
+  %t2 = add i32 %t, 2
+  %t3 = icmp eq i32 %t2, 0
+  br label %bb7, !amdgpu.uniform !0
+
+bb7:                                              ; preds = %bb, %.exit
+  %t4 = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t5 = xor i1 %t4, true
+  %t6 = zext i1 %t5 to i32
+  %t7 = sub i32 %t1, %t6
+  br label %.exit, !amdgpu.uniform !0
+}
+
+; Test that inactive lanes are masked out in the s_and result before it is used as the carry-in of the sub.
+
+; CHECK-LABEL: name: combine_sub_zext_and
+; CHECK: S_AND_B64
+; CHECK-NEXT: S_AND_B64 $exec
+; CHECK: S_SUB_CO_PSEUDO
+
+define void @combine_sub_zext_and() {
+.entry:
+  br label %.exit, !amdgpu.uniform !0
+
+.exit:                                            ; preds = %bb7, %.entry
+  %t = phi i32 [ 1050, %.entry ], [ 0, %bb7 ]
+  %t1 = phi i32 [ 0, %.entry ], [ %t7, %bb7 ]
+  br i1 undef, label %bb7, label %bb, !amdgpu.uniform !0
+
+bb:                                               ; preds = %.exit
+  %t2 = add i32 %t, 2
+  %t3 = icmp eq i32 %t2, 0
+  br label %bb7, !amdgpu.uniform !0
+
+bb7:                                              ; preds = %bb, %.exit
+  %t4 = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t4a = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t5 = and i1 %t4, %t4a
+  %t6 = zext i1 %t5 to i32
+  %t7 = sub i32 %t1, %t6
+  br label %.exit, !amdgpu.uniform !0
+}
+
+; Test that inactive lanes are masked out in the s_or result before it is used as the carry-in of the sub.
+
+; CHECK-LABEL: name: combine_sub_zext_or
+; CHECK: S_OR_B64
+; CHECK-NEXT: S_AND_B64 $exec
+; CHECK: S_SUB_CO_PSEUDO
+
+define void @combine_sub_zext_or() {
+.entry:
+  br label %.exit, !amdgpu.uniform !0
+
+.exit:                                            ; preds = %bb7, %.entry
+  %t = phi i32 [ 1050, %.entry ], [ 0, %bb7 ]
+  %t1 = phi i32 [ 0, %.entry ], [ %t7, %bb7 ]
+  br i1 undef, label %bb7, label %bb, !amdgpu.uniform !0
+
+bb:                                               ; preds = %.exit
+  %t2 = add i32 %t, 2
+  %t3 = icmp eq i32 %t2, 0
+  br label %bb7, !amdgpu.uniform !0
+
+bb7:                                              ; preds = %bb, %.exit
+  %t4 = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t4a = phi i1 [ %t3, %bb ], [ undef, %.exit ]
+  %t5 = or i1 %t4, %t4a
+  %t6 = zext i1 %t5 to i32
+  %t7 = sub i32 %t1, %t6
+  br label %.exit, !amdgpu.uniform !0
+}
+
+attributes #0 = { nounwind readonly willreturn }
+
+!0 = !{}
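
Note: a minimal sketch of the wave64 MIR shape the new masking is expected to produce for the xor case; the virtual register names and register classes below are illustrative, not copied from the test output.

  %c:sreg_64 = S_XOR_B64 %a, %b                  ; bool op may set bits for inactive lanes
  %m:sreg_64 = S_AND_B64 $exec, %c               ; mask inserted by this patch
  %d:sreg_32, %co:sreg_64 = S_ADD_CO_PSEUDO %x, %y, %m

On wave32 subtargets the same pattern is emitted with S_AND_B32 against $exec_lo.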