Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -26489,10 +26489,18 @@ // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X) // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X) // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X) + // select_cc seteq X, 0, 0, cttz(X) -> and(cttz(X), sizeof(X) - 1) + // select_cc seteq X, 0, 0, cttz_zero_undef(X) -> and(cttz(X), sizeof(X) - 1) + // select_cc seteq X, 0, 0, ctlz(X) -> and(ctlz(X), sizeof(X) - 1) + // select_cc seteq X, 0, 0, ctlz_zero_undef(X) -> and(ctlz(X), sizeof(X) - 1) // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X) // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X) // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X) // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X) + // select_cc setne X, 0, cttz(X), 0 -> and(cttz(X), sizeof(X) - 1) + // select_cc setne X, 0, cttz_zero_undef(X), 0 -> and(cttz(X), sizeof(X) - 1) + // select_cc setne X, 0, ctlz(X), 0 -> and(ctlz(X), sizeof(X) - 1) + // select_cc setne X, 0, ctlz_zero_undef(X), 0 -> and(ctlz(X), sizeof(X) - 1) if (N1C && N1C->isZero() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue ValueOnZero = N2; SDValue Count = N3; @@ -26501,21 +26509,33 @@ std::swap(ValueOnZero, Count); // Check if the value on zero is a constant equal to the bits in the type. 
if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) { - if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) { + unsigned VTSize = VT.getSizeInBits(); + bool canUseAND = !LegalOperations || TLI.isOperationLegal(ISD::AND, VT); + bool canTransformToAND = ValueOnZeroC->getAPIntValue().isZero() && + isPowerOf2_64(VTSize) && canUseAND; + SDValue Res; + if (canTransformToAND || ValueOnZeroC->getAPIntValue() == VTSize) { // If the other operand is cttz/cttz_zero_undef of N0, and cttz is // legal, combine to just cttz. if ((Count.getOpcode() == ISD::CTTZ || Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) && N0 == Count.getOperand(0) && (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT))) - return DAG.getNode(ISD::CTTZ, DL, VT, N0); + Res = DAG.getNode(ISD::CTTZ, DL, VT, N0); // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is // legal, combine to just ctlz. if ((Count.getOpcode() == ISD::CTLZ || Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) && N0 == Count.getOperand(0) && (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT))) - return DAG.getNode(ISD::CTLZ, DL, VT, N0); + Res = DAG.getNode(ISD::CTLZ, DL, VT, N0); + + if (Res) { + if (canTransformToAND) + return DAG.getNode(ISD::AND, DL, VT, Res, + DAG.getConstant(VTSize - 1, DL, VT)); + return Res; + } } } } Index: llvm/test/CodeGen/AArch64/fold-csel-cttz-and.ll =================================================================== --- llvm/test/CodeGen/AArch64/fold-csel-cttz-and.ll +++ llvm/test/CodeGen/AArch64/fold-csel-cttz-and.ll @@ -112,7 +112,7 @@ ; CHECK-LABEL: cttzlhsnot0: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: rbit w9, w0 -; CHECK-NEXT: mov w8, #10 +; CHECK-NEXT: mov w8, #10 // =0xa ; CHECK-NEXT: clz w9, w9 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: csel w0, w8, w9, eq @@ -128,8 +128,7 @@ ; CHECK-LABEL: notcttz: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: clz w8, w0 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: and w0, w8, #0x1f ; CHECK-NEXT: ret entry: %0 = call i32 
@llvm.ctlz.i32(i32 %x, i1 true) Index: llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -1069,9 +1069,9 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ffbh_u32_e32 v1, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_min_u32_e32 v0, 32, v0 +; SI-NEXT: v_and_b32_e32 v0, 31, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -1085,9 +1085,9 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_min_u32_e32 v0, 32, v0 +; VI-NEXT: v_and_b32_e32 v2, 31, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1097,7 +1097,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1109,9 +1109,11 @@ ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, ; EG-NEXT: ALU clause starting at 11: ; EG-NEXT: FFBH_UINT * T0.W, T0.X, -; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45) ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: @@ -1153,9 +1155,9 
@@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ffbh_u32_e32 v1, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_min_u32_e32 v0, 32, v0 +; SI-NEXT: v_and_b32_e32 v0, 31, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -1169,9 +1171,9 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_min_u32_e32 v0, 32, v0 +; VI-NEXT: v_and_b32_e32 v2, 31, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1181,7 +1183,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1193,9 +1195,11 @@ ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, ; EG-NEXT: ALU clause starting at 11: ; EG-NEXT: FFBH_UINT * T0.W, T0.X, -; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45) ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: Index: llvm/test/CodeGen/SPARC/cttz.ll =================================================================== --- llvm/test/CodeGen/SPARC/cttz.ll +++ llvm/test/CodeGen/SPARC/cttz.ll @@ -15,9 +15,9 @@ ; CHECK-NEXT: add %o2, %lo(.LCPI0_0), %o2 ; CHECK-NEXT: ldub [%o2+%o1], %o1 ; CHECK-NEXT: 
cmp %o0, 0 -; CHECK-NEXT: move %icc, 0, %o1 +; CHECK-NEXT: move %icc, 32, %o1 ; CHECK-NEXT: retl -; CHECK-NEXT: mov %o1, %o0 +; CHECK-NEXT: and %o1, 31, %o0 entry: %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true) %1 = icmp eq i32 %x, 0 @@ -44,16 +44,15 @@ ; CHECK-NEXT: srl %o3, 27, %o3 ; CHECK-NEXT: ldub [%o4+%o3], %o3 ; CHECK-NEXT: srl %o2, 27, %o2 -; CHECK-NEXT: ldub [%o4+%o2], %o4 -; CHECK-NEXT: add %o3, 32, %o2 -; CHECK-NEXT: cmp %o1, 0 -; CHECK-NEXT: movne %icc, %o4, %o2 -; CHECK-NEXT: or %o1, %o0, %o0 +; CHECK-NEXT: ldub [%o4+%o2], %o2 ; CHECK-NEXT: cmp %o0, 0 -; CHECK-NEXT: move %icc, 0, %o2 -; CHECK-NEXT: mov %g0, %o0 +; CHECK-NEXT: move %icc, 32, %o3 +; CHECK-NEXT: add %o3, 32, %o0 +; CHECK-NEXT: cmp %o1, 0 +; CHECK-NEXT: movne %icc, %o2, %o0 +; CHECK-NEXT: and %o0, 63, %o1 ; CHECK-NEXT: retl -; CHECK-NEXT: mov %o2, %o1 +; CHECK-NEXT: mov %g0, %o0 entry: %0 = call i64 @llvm.cttz.i64(i64 %x, i1 true) %1 = icmp eq i64 %x, 0