diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5054,14 +5054,95 @@ [](ConstantSDNode *C) { return !C->isZero(); })) return true; - // TODO: Recognize more cases here. + // TODO: Recognize more cases here. Most of the cases are also incomplete to + // some degree. switch (Op.getOpcode()) { - default: break; + default: + break; + case ISD::OR: - if (isKnownNeverZero(Op.getOperand(1), Depth + 1) || - isKnownNeverZero(Op.getOperand(0), Depth + 1)) + return isKnownNeverZero(Op.getOperand(1), Depth + 1) || + isKnownNeverZero(Op.getOperand(0), Depth + 1); + + case ISD::VSELECT: + case ISD::SELECT: + return isKnownNeverZero(Op.getOperand(1), Depth + 1) && + isKnownNeverZero(Op.getOperand(2), Depth + 1); + + case ISD::SHL: + if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap()) + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + + // 1 << X is never zero. TODO: This can be expanded if we can bound X. + // The expression is really !Known.One[BitWidth-MaxLog2(Known):0].isZero() + if (computeKnownBits(Op.getOperand(0), Depth + 1).One[0]) return true; break; + + case ISD::UADDSAT: + case ISD::UMAX: + return isKnownNeverZero(Op.getOperand(1), Depth + 1) || + isKnownNeverZero(Op.getOperand(0), Depth + 1); + + case ISD::UMIN: + return isKnownNeverZero(Op.getOperand(1), Depth + 1) && + isKnownNeverZero(Op.getOperand(0), Depth + 1); + + case ISD::ROTL: + case ISD::ROTR: + case ISD::BITREVERSE: + case ISD::BSWAP: + case ISD::CTPOP: + case ISD::ABS: + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + + case ISD::SRA: + case ISD::SRL: + if (Op->getFlags().hasExact()) + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + // Signed >> X is never zero. TODO: This can be expanded if we can bound X. 
+ // The expression is really + // !Known.One[SignBit:SignBit-(BitWidth-MaxLog2(Known))].isZero() + if (computeKnownBits(Op.getOperand(0), Depth + 1).isNegative()) + return true; + break; + + case ISD::UDIV: + case ISD::SDIV: + // div exact can only produce a zero if the dividend is zero. + // TODO: For udiv this is also true if Op1 u<= Op0 + if (Op->getFlags().hasExact()) + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + break; + + case ISD::ADD: + if (Op->getFlags().hasNoUnsignedWrap()) + if (isKnownNeverZero(Op.getOperand(1), Depth + 1) || + isKnownNeverZero(Op.getOperand(0), Depth + 1)) + return true; + // TODO: There are a lot more cases we can prove for add. + break; + + case ISD::SUB: { + if (isNullConstant(Op.getOperand(0))) + return isKnownNeverZero(Op.getOperand(1), Depth + 1); + + std::optional ne = + KnownBits::ne(computeKnownBits(Op.getOperand(0), Depth + 1), + computeKnownBits(Op.getOperand(1), Depth + 1)); + return ne && *ne; + } + + case ISD::MUL: + if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap()) + if (isKnownNeverZero(Op.getOperand(1), Depth + 1) && + isKnownNeverZero(Op.getOperand(0), Depth + 1)) + return true; + break; + + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + return isKnownNeverZero(Op.getOperand(0), Depth + 1); } return computeKnownBits(Op, Depth).isNonZero(); diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -780,7 +780,6 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: v_min_u32_e32 v0, 32, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -815,8 +814,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v2, 32, v0 +; VI-NEXT: 
v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -826,7 +824,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -839,10 +837,9 @@ ; EG-NEXT: LSHL * T0.W, T1.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, -; EG-NEXT: FFBL_INT * T1.W, PV.W, -; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) +; EG-NEXT: FFBL_INT T0.X, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: @@ -912,9 +909,8 @@ ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_ffbl_b32_e32 v1, v1 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: v_min_u32_e32 v1, 0xffffffdf, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 -; SI-NEXT: v_min3_u32 v0, v0, v1, 64 +; SI-NEXT: v_min_u32_e32 v0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -974,7 +970,7 @@ ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; VI-NEXT: v_ffbl_b32_e32 v3, v3 -; VI-NEXT: v_add_u32_e64 v3, s[2:3], v3, 32 clamp +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) @@ -983,7 +979,7 @@ ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min3_u32 v0, v0, v3, 64 +; VI-NEXT: v_min_u32_e32 v0, v0, v3 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 
@@ -993,27 +989,24 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 12, @15, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 6, #1 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1 -; EG-NEXT: VTX_READ_16 T3.X, T0.X, 2, #1 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 4, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1 ; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: LSHL * T0.W, T1.X, literal.x, +; EG-NEXT: LSHL T0.W, T1.X, literal.x, +; EG-NEXT: LSHL * T1.W, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, -; EG-NEXT: FFBL_INT T1.W, PV.W, -; EG-NEXT: LSHL * T2.W, T3.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W, -; EG-NEXT: OR_INT * T1.W, PS, T2.X, -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T3.X, +; EG-NEXT: FFBL_INT T0.W, PV.W, +; EG-NEXT: OR_INT * T1.W, T1.W, T2.X, ; EG-NEXT: FFBL_INT T2.W, PS, ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) @@ -1137,7 +1130,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1151,8 +1144,6 @@ ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, ; EG-NEXT: FFBL_INT * T1.W, PV.W, -; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W, -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 
2(2.802597e-45) @@ -1259,7 +1250,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1273,8 +1264,6 @@ ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, ; EG-NEXT: FFBL_INT * T1.W, PV.W, -; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W, -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 2(2.802597e-45) @@ -1337,7 +1326,6 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: v_min_u32_e32 v0, 32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -1375,7 +1363,6 @@ ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1387,7 +1374,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1400,9 +1387,7 @@ ; EG-NEXT: LSHL * T0.W, T1.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, -; EG-NEXT: FFBL_INT * T1.W, PV.W, -; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: FFBL_INT * T0.W, PV.W, ; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W, @@ -1566,9 +1551,7 @@ ; 
VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_min_u32_e32 v2, 32, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/X86/divrem-by-select.ll b/llvm/test/CodeGen/X86/divrem-by-select.ll --- a/llvm/test/CodeGen/X86/divrem-by-select.ll +++ b/llvm/test/CodeGen/X86/divrem-by-select.ll @@ -67,20 +67,16 @@ ; CHECK-X64-V4: # %bb.0: ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] -; CHECK-X64-V4-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} -; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx -; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax -; CHECK-X64-V4-NEXT: xorl %edx, %edx -; CHECK-X64-V4-NEXT: divq %rcx -; CHECK-X64-V4-NEXT: movq %rax, %rcx -; CHECK-X64-V4-NEXT: vmovq %xmm0, %rsi -; CHECK-X64-V4-NEXT: vmovq %xmm1, %rax -; CHECK-X64-V4-NEXT: xorl %edx, %edx -; CHECK-X64-V4-NEXT: divq %rsi +; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rdx +; CHECK-X64-V4-NEXT: movabsq $-3689348814741910323, %rax # imm = 0xCCCCCCCCCCCCCCCD +; CHECK-X64-V4-NEXT: mulxq %rax, %rcx, %rcx ; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0 -; CHECK-X64-V4-NEXT: vmovq %rax, %xmm1 -; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-X64-V4-NEXT: vmovq %xmm1, %rdx +; CHECK-X64-V4-NEXT: mulxq %rax, %rax, %rax +; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2 +; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; CHECK-X64-V4-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} +; CHECK-X64-V4-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-X64-V4-NEXT: retq ;; Fails at the moment because `10` is even so there is no common @@ -115,23 +111,23 @@ ; ; CHECK-X64-V4-LABEL: udiv_indentity_non_zero: ; CHECK-X64-V4: # %bb.0: -; CHECK-X64-V4-NEXT: 
vpsllq $63, %xmm0, %xmm0 -; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 -; CHECK-X64-V4-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] -; CHECK-X64-V4-NEXT: vpsubq %xmm0, %xmm2, %xmm3 {%k1} -; CHECK-X64-V4-NEXT: vpextrq $1, %xmm3, %rcx +; CHECK-X64-V4-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; CHECK-X64-V4-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; CHECK-X64-V4-NEXT: vpextrq $1, %xmm2, %rcx ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: xorl %edx, %edx ; CHECK-X64-V4-NEXT: divq %rcx ; CHECK-X64-V4-NEXT: movq %rax, %rcx -; CHECK-X64-V4-NEXT: vmovq %xmm3, %rsi +; CHECK-X64-V4-NEXT: vmovq %xmm2, %rsi ; CHECK-X64-V4-NEXT: vmovq %xmm1, %rax ; CHECK-X64-V4-NEXT: xorl %edx, %edx ; CHECK-X64-V4-NEXT: divq %rsi +; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 +; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 ; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0 -; CHECK-X64-V4-NEXT: vmovq %rax, %xmm1 -; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2 +; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm2[0],xmm0[0] +; CHECK-X64-V4-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-X64-V4-NEXT: retq %non_zero = add nsw nuw <2 x i64> %y, %d = select <2 x i1> %c, <2 x i64> %non_zero, <2 x i64> diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -47,9 +47,7 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: movl $122, %eax ; CHECK-NEXT: cmovnel %esi, %eax -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %y = or i32 %x, 1 %z = select i1 %c, i32 %y, i32 122 @@ -85,9 +83,7 @@ ; CHECK-NEXT: movl $123, %eax ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %eax -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax 
+; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %z = shl i32 123, %x %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -101,9 +97,7 @@ ; CHECK-NEXT: orl $256, %esi # imm = 0x100 ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: bsfl %esi, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %esi, %eax ; CHECK-NEXT: retq %y = or i32 %yy, 256 %z = shl nsw i32 %y, %x @@ -118,9 +112,7 @@ ; CHECK-NEXT: orl $256, %esi # imm = 0x100 ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: bsfl %esi, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %esi, %eax ; CHECK-NEXT: retq %y = or i32 %yy, 256 %z = shl nuw i32 %y, %x @@ -153,9 +145,7 @@ ; CHECK-NEXT: incl %edi ; CHECK-NEXT: movl $-1, %eax ; CHECK-NEXT: cmovnel %edi, %eax -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %z = call i32 @llvm.uadd.sat.i32(i32 %x, i32 1) %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -190,9 +180,7 @@ ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: cmpl %eax, %edi ; CHECK-NEXT: cmoval %edi, %eax -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %yy = shl nuw i32 4, %y %z = call i32 @llvm.umax.i32(i32 %x, i32 %yy) @@ -228,9 +216,7 @@ ; CHECK-NEXT: addl $4, %esi ; CHECK-NEXT: cmpl %esi, %eax ; CHECK-NEXT: cmovbl %eax, %esi -; CHECK-NEXT: bsfl %esi, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %esi, %eax ; CHECK-NEXT: retq %x = shl nuw i32 4, %xx %y = add nuw nsw i32 %yy, 4 @@ -371,9 +357,7 @@ ; CHECK-NEXT: orl $256, %esi # imm = 0x100 ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: sarl %cl, %esi -; CHECK-NEXT: bsfl %esi, %ecx -; CHECK-NEXT: movl $32, %eax -; 
CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %esi, %eax ; CHECK-NEXT: retq %y = or i32 %yy, 256 %z = ashr exact i32 %y, %x @@ -407,9 +391,7 @@ ; CHECK-NEXT: movl $-2147360405, %eax # imm = 0x8001E16B ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrl %cl, %eax -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %z = lshr i32 2147606891, %x %r = call i32 @llvm.cttz.i32(i32 %z, i1 false) @@ -423,9 +405,7 @@ ; CHECK-NEXT: orl $256, %esi # imm = 0x100 ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrl %cl, %esi -; CHECK-NEXT: bsfl %esi, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %esi, %eax ; CHECK-NEXT: retq %y = or i32 %yy, 256 %z = lshr exact i32 %y, %x @@ -459,9 +439,7 @@ ; CHECK-NEXT: orl $64, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %esi -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %x = or i32 %xx, 64 %z = udiv exact i32 %x, %y @@ -495,9 +473,7 @@ ; CHECK-NEXT: orl $64, %eax ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %esi -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %x = or i32 %xx, 64 %z = sdiv exact i32 %x, %y @@ -529,9 +505,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: orl $1, %edi ; CHECK-NEXT: addl %esi, %edi -; CHECK-NEXT: bsfl %edi, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %edi, %eax ; CHECK-NEXT: retq %x = or i32 %xx, 1 %z = add nuw i32 %x, %y @@ -565,9 +539,7 @@ ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: negl %eax -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; 
CHECK-NEXT: retq %x = shl nuw nsw i32 256, %xx %z = sub i32 0, %x @@ -582,9 +554,7 @@ ; CHECK-NEXT: orl $64, %eax ; CHECK-NEXT: andl $-65, %edi ; CHECK-NEXT: subl %eax, %edi -; CHECK-NEXT: bsfl %edi, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %edi, %eax ; CHECK-NEXT: retq %x = or i32 %xx, 64 %y = and i32 %xx, -65 @@ -745,9 +715,7 @@ ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %x = shl nuw nsw i16 256, %xx %z = zext i16 %x to i32 @@ -780,9 +748,7 @@ ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: bsfl %eax, %ecx -; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: rep bsfl %eax, %eax ; CHECK-NEXT: retq %x = shl nuw nsw i16 256, %xx %z = sext i16 %x to i32