diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5007,14 +5007,97 @@
       [](ConstantSDNode *C) { return !C->isZero(); }))
     return true;

-  // TODO: Recognize more cases here.
+  // TODO: Recognize more cases here. Most of the cases are also incomplete to
+  // some degree.
   switch (Op.getOpcode()) {
-  default: break;
+  default:
+    break;
   case ISD::OR:
-    if (isKnownNeverZero(Op.getOperand(1)) ||
-        isKnownNeverZero(Op.getOperand(0)))
+    return isKnownNeverZero(Op.getOperand(0), Depth) ||
+           isKnownNeverZero(Op.getOperand(1), Depth);
+  case ISD::VSELECT:
+  case ISD::SELECT:
+    return isKnownNeverZero(Op.getOperand(1), Depth) &&
+           isKnownNeverZero(Op.getOperand(2), Depth);
+  case ISD::SHL:
+    if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap())
+      return isKnownNeverZero(Op.getOperand(0), Depth);
+
+    // 1 << X is never zero. TODO: This can be expanded if we can bound X.
+    // The expression is really !Known.One[BitWidth-MaxLog2(Known):0].isZero()
+    if (computeKnownBits(Op.getOperand(0), Depth).One[0])
       return true;
     break;
+
+  case ISD::UADDSAT:
+  case ISD::UMAX:
+    return isKnownNeverZero(Op.getOperand(0), Depth) ||
+           isKnownNeverZero(Op.getOperand(1), Depth);
+
+  case ISD::UMIN:
+    return isKnownNeverZero(Op.getOperand(0), Depth) &&
+           isKnownNeverZero(Op.getOperand(1), Depth);
+
+  case ISD::ROTL:
+  case ISD::ROTR:
+  case ISD::BITREVERSE:
+  case ISD::BSWAP:
+  case ISD::CTPOP:
+  case ISD::ABS:
+    return isKnownNeverZero(Op.getOperand(0), Depth);
+
+  case ISD::SRA:
+  case ISD::SRL:
+    if (Op->getFlags().hasExact())
+      return isKnownNeverZero(Op.getOperand(0), Depth);
+    // Signed >> X is never zero. TODO: This can be expanded if we can bound X.
+    // The expression is really
+    // !Known.One[SignBit:SignBit-(BitWidth-MaxLog2(Known))].isZero()
+    if (computeKnownBits(Op.getOperand(0), Depth).isNegative())
+      return true;
+    break;
+  case ISD::UDIV:
+  case ISD::SDIV:
+    // div exact can only produce a zero if the dividend is zero.
+    // TODO: For udiv this is also true if Op1 u<= Op0
+    if (Op->getFlags().hasExact())
+      return isKnownNeverZero(Op.getOperand(0), Depth);
+
+    break;
+  case ISD::ADD: {
+    if (Op->getFlags().hasNoUnsignedWrap())
+      if (isKnownNeverZero(Op.getOperand(0), Depth) ||
+          isKnownNeverZero(Op.getOperand(1), Depth))
+        return true;
+    // TODO: There are a lot more cases we can prove for add.
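+    // Illustrative examples (editorial additions, not part of the original
+    // TODO): a known-odd operand plus a known-even operand is odd and thus
+    // never zero, and KnownBits::computeForAddSub on the operands' known bits
+    // can rule out zero directly when it proves some result bit is one.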
+    break;
+  }
+  case ISD::SUB: {
+    if (isNullConstant(Op.getOperand(0)))
+      return isKnownNeverZero(Op.getOperand(1), Depth);
+
+    std::optional<bool> ne =
+        KnownBits::ne(computeKnownBits(Op.getOperand(0), Depth),
+                      computeKnownBits(Op.getOperand(1), Depth));
+    return ne && *ne;
+  }
+  case ISD::MUL: {
+    if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap())
+      if (isKnownNeverZero(Op.getOperand(0), Depth) &&
+          isKnownNeverZero(Op.getOperand(1), Depth))
+        return true;
+    break;
+  }
+  case ISD::BITCAST:
+    if (!Op.getOperand(0).getValueType().isFloatingPoint())
+      return isKnownNeverZero(Op.getOperand(0), Depth);
+    break;
+
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+    return isKnownNeverZero(Op.getOperand(0), Depth);
   }

   return !computeKnownBits(Op, Depth).One.isZero();
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -780,7 +780,6 @@
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
-; SI-NEXT: v_min_u32_e32 v0, 32, v0
 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_endpgm
 ;
@@ -815,8 +814,7 @@
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v2, 32, v0
+; VI-NEXT: v_ffbl_b32_e32 v2, v0
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: flat_store_dword v[0:1], v2
@@ -826,7 +824,7 @@
 ; EG: ; %bb.0:
 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; EG-NEXT: CF_END
 ; EG-NEXT: PAD
@@ -839,10 +837,9 @@
 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
-; EG-NEXT: FFBL_INT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
+; EG-NEXT: FFBL_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
 ; GFX9-GISEL: ; %bb.0:
@@ -912,9 +909,8 @@
 ; SI-NEXT: v_or_b32_e32 v1, v3, v2
 ; SI-NEXT: v_ffbl_b32_e32 v1, v1
 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
-; SI-NEXT: v_min_u32_e32 v1, 0xffffffdf, v1
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_min3_u32 v0, v0, v1, 64
+; SI-NEXT: v_min_u32_e32 v0, v0, v1
 ; SI-NEXT: v_mov_b32_e32 v1, 0
 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT: s_endpgm
@@ -974,7 +970,7 @@
 ; VI-NEXT: s_waitcnt vmcnt(3)
 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
 ; VI-NEXT: v_ffbl_b32_e32 v3, v3
-; VI-NEXT: v_add_u32_e64 v3, s[2:3], v3, 32 clamp
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3
 ; VI-NEXT: s_waitcnt vmcnt(2)
 ; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
@@ -983,7 +979,7 @@
 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min3_u32 v0, v0, v3, 64
+; VI-NEXT: v_min_u32_e32 v0, v0, v3
 ; VI-NEXT: v_mov_b32_e32 v3, s1
 ; VI-NEXT: v_mov_b32_e32 v2, s0
 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -993,27 +989,24 @@
 ; EG: ; %bb.0:
 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
 ; EG-NEXT: CF_END
 ; EG-NEXT: PAD
 ; EG-NEXT: Fetch clause starting at 6:
 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 6, #1
 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
-; EG-NEXT: VTX_READ_16 T3.X, T0.X, 2, #1
-; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 4, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
 ; EG-NEXT: ALU clause starting at 14:
 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
 ; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
+; EG-NEXT: LSHL T0.W, T1.X, literal.x,
+; EG-NEXT: LSHL * T1.W, T0.X, literal.x,
 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
-; EG-NEXT: FFBL_INT T1.W, PV.W,
-; EG-NEXT: LSHL * T2.W, T3.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W,
-; EG-NEXT: OR_INT * T1.W, PS, T2.X,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, T3.X,
+; EG-NEXT: FFBL_INT T0.W, PV.W,
+; EG-NEXT: OR_INT * T1.W, T1.W, T2.X,
 ; EG-NEXT: FFBL_INT T2.W, PS,
 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
@@ -1137,7 +1130,7 @@
 ; EG: ; %bb.0:
 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; EG-NEXT: CF_END
 ; EG-NEXT: PAD
@@ -1151,8 +1144,6 @@
 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT: -1(nan), 2(2.802597e-45)
@@ -1259,7 +1250,7 @@
 ; EG: ; %bb.0:
 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; EG-NEXT: CF_END
 ; EG-NEXT: PAD
@@ -1273,8 +1264,6 @@
 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT: -1(nan), 2(2.802597e-45)
@@ -1337,7 +1326,6 @@
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
-; SI-NEXT: v_min_u32_e32 v0, 32, v0
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -1375,7 +1363,6 @@
 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v0, 32, v0
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
 ; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1387,7 +1374,7 @@
 ; EG: ; %bb.0:
 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; EG-NEXT: CF_END
 ; EG-NEXT: PAD
@@ -1400,9 +1387,7 @@
 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
-; EG-NEXT: FFBL_INT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: FFBL_INT * T0.W, PV.W,
 ; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
@@ -1566,9 +1551,7 @@
 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT: v_ffbl_b32_e32 v2, v2
-; VI-NEXT: v_min_u32_e32 v2, 32, v2
+; VI-NEXT: v_ffbl_b32_e32 v2, v0
 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
 ; VI-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/X86/divrem-by-select.ll b/llvm/test/CodeGen/X86/divrem-by-select.ll
--- a/llvm/test/CodeGen/X86/divrem-by-select.ll
+++ b/llvm/test/CodeGen/X86/divrem-by-select.ll
@@ -67,20 +67,16 @@
 ; CHECK-X64-V4: # %bb.0:
 ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0
 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1
-; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
-; CHECK-X64-V4-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
-; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx
-; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax
-; CHECK-X64-V4-NEXT: xorl %edx, %edx
-; CHECK-X64-V4-NEXT: divq %rcx
-; CHECK-X64-V4-NEXT: movq %rax, %rcx
-; CHECK-X64-V4-NEXT: vmovq %xmm0, %rsi
-; CHECK-X64-V4-NEXT: vmovq %xmm1, %rax
-; CHECK-X64-V4-NEXT: xorl %edx, %edx
-; CHECK-X64-V4-NEXT: divq %rsi
+; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rdx
+; CHECK-X64-V4-NEXT: movabsq $-3689348814741910323, %rax # imm = 0xCCCCCCCCCCCCCCCD
+; CHECK-X64-V4-NEXT: mulxq %rax, %rcx, %rcx
 ; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0
-; CHECK-X64-V4-NEXT: vmovq %rax, %xmm1
-; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-X64-V4-NEXT: vmovq %xmm1, %rdx
+; CHECK-X64-V4-NEXT: mulxq %rax, %rax, %rax
+; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2
+; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; CHECK-X64-V4-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1}
+; CHECK-X64-V4-NEXT: vmovdqa %xmm1, %xmm0
 ; CHECK-X64-V4-NEXT: retq

 ;; Fails at the moment because `10` is even so there is no common
@@ -115,23 +111,23 @@
 ;
 ; CHECK-X64-V4-LABEL: udiv_indentity_non_zero:
 ; CHECK-X64-V4: # %bb.0:
-; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1
-; CHECK-X64-V4-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
-; CHECK-X64-V4-NEXT: vpsubq %xmm0, %xmm2, %xmm3 {%k1}
-; CHECK-X64-V4-NEXT: vpextrq $1, %xmm3, %rcx
+; CHECK-X64-V4-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; CHECK-X64-V4-NEXT: vpsubq %xmm3, %xmm2, %xmm2
+; CHECK-X64-V4-NEXT: vpextrq $1, %xmm2, %rcx
 ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax
 ; CHECK-X64-V4-NEXT: xorl %edx, %edx
 ; CHECK-X64-V4-NEXT: divq %rcx
 ; CHECK-X64-V4-NEXT: movq %rax, %rcx
-; CHECK-X64-V4-NEXT: vmovq %xmm3, %rsi
+; CHECK-X64-V4-NEXT: vmovq %xmm2, %rsi
 ; CHECK-X64-V4-NEXT: vmovq %xmm1, %rax
 ; CHECK-X64-V4-NEXT: xorl %edx, %edx
 ; CHECK-X64-V4-NEXT: divq %rsi
+; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0
+; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1
 ; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0
-; CHECK-X64-V4-NEXT: vmovq %rax, %xmm1
-; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2
+; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm2[0],xmm0[0]
+; CHECK-X64-V4-NEXT: vmovdqa %xmm1, %xmm0
 ; CHECK-X64-V4-NEXT: retq
   %non_zero = add nsw nuw <2 x i64> %y, <i64 1, i64 1>
   %d = select <2 x i1> %c, <2 x i64> %non_zero, <2 x i64>