Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4823,7 +4823,8 @@
 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
 // in direction shift1 by Neg. The range [0, EltSize) means that we only need
 // to consider shift amounts with defined behavior.
-static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
+static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
+                           SelectionDAG &DAG) {
   // If EltSize is a power of 2 then:
   //
   //  (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
@@ -4858,9 +4859,13 @@
   unsigned MaskLoBits = 0;
   if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
     if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
-      if (NegC->getAPIntValue() == EltSize - 1) {
+      KnownBits Known;
+      DAG.computeKnownBits(Neg.getOperand(0), Known);
+      unsigned Bits = Log2_64(EltSize);
+      if (NegC->getAPIntValue().getActiveBits() <= Bits &&
+          ((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) {
         Neg = Neg.getOperand(0);
-        MaskLoBits = Log2_64(EltSize);
+        MaskLoBits = Bits;
       }
     }
   }
@@ -4875,10 +4880,16 @@
 
   // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
   // Pos'. The truncation is redundant for the purpose of the equality.
-  if (MaskLoBits && Pos.getOpcode() == ISD::AND)
-    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
-      if (PosC->getAPIntValue() == EltSize - 1)
+  if (MaskLoBits && Pos.getOpcode() == ISD::AND) {
+    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) {
+      KnownBits Known;
+      DAG.computeKnownBits(Pos.getOperand(0), Known);
+      if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits &&
+          ((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >=
+           MaskLoBits))
         Pos = Pos.getOperand(0);
+    }
+  }
 
   // The condition we need is now:
   //
@@ -4934,7 +4945,7 @@
   //          (srl x, (*ext y))) ->
   //   (rotr x, y) or (rotl x, (sub 32, y))
   EVT VT = Shifted.getValueType();
-  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) {
+  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
     bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
     return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
                        HasPos ? Pos : Neg).getNode();
Index: llvm/trunk/test/CodeGen/X86/combine-rotates.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/combine-rotates.ll
+++ llvm/trunk/test/CodeGen/X86/combine-rotates.ll
@@ -61,27 +61,14 @@
 define <4 x i32> @rotate_demanded_bits(<4 x i32>, <4 x i32>) {
 ; XOP-LABEL: rotate_demanded_bits:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [30,30,30,30]
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm3
-; XOP-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; XOP-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512-LABEL: rotate_demanded_bits:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
-; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm3
-; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %3 = and <4 x i32> %1, <i32 30, i32 30, i32 30, i32 30>
   %4 = shl <4 x i32> %0, %3
@@ -117,28 +104,15 @@
 ; XOP-LABEL: rotate_demanded_bits_3:
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [30,30,30,30]
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; XOP-NEXT:    vpshld %xmm3, %xmm0, %xmm3
-; XOP-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; XOP-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512-LABEL: rotate_demanded_bits_3:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
-; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX512-NEXT:    vpsllvd %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %3 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
   %4 = and <4 x i32> %3, <i32 30, i32 30, i32 30, i32 30>
Index: llvm/trunk/test/CodeGen/X86/rotate4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/rotate4.ll
+++ llvm/trunk/test/CodeGen/X86/rotate4.ll
@@ -284,15 +284,9 @@
 define i32 @rotate_demanded_bits(i32, i32) {
 ; CHECK-LABEL: rotate_demanded_bits:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    andb $30, %sil
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    andl $30, %ecx
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll %cl, %eax
-; CHECK-NEXT:    negl %ecx
-; CHECK-NEXT:    andb $30, %cl
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    shrl %cl, %edi
-; CHECK-NEXT:    orl %eax, %edi
+; CHECK-NEXT:    roll %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %3 = and i32 %1, 30
@@ -324,16 +318,10 @@
 define i32 @rotate_demanded_bits_3(i32, i32) {
 ; CHECK-LABEL: rotate_demanded_bits_3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addl %esi, %esi
-; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    andb $30, %cl
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll %cl, %eax
-; CHECK-NEXT:    negl %esi
+; CHECK-NEXT:    addb %sil, %sil
 ; CHECK-NEXT:    andb $30, %sil
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    shrl %cl, %edi
-; CHECK-NEXT:    orl %eax, %edi
+; CHECK-NEXT:    roll %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %3 = shl i32 %1, 1
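
For reference, a minimal IR sketch of the kind of pattern this change lets the combiner fold; the function and value names below are illustrative (not taken from the patch) and merely mirror the shape of the rotate_demanded_bits_3 tests above. Because the rotate amount comes from a shl by 1, its low bit is known zero, so computeKnownBits lets matchRotateSub accept an AND mask of 30 where it previously required the full EltSize - 1 mask of 31, and the shl/lshr/or sequence becomes a single rotate (roll, vprotd, or vprolvd in the checks above).

; Sketch only - not part of the patch. With the change, this folds to a rotate
; because bit 0 of %amt is known zero, so the AND with 30 covers all five low
; bits of a 32-bit rotate amount.
define i32 @rotl_by_even_amount(i32 %x, i32 %y) {
  %amt = shl i32 %y, 1       ; amount is even => bit 0 known zero
  %lamt = and i32 %amt, 30   ; mask of 30 now accepted (30 | known-zero bit = 31)
  %hi = shl i32 %x, %lamt
  %neg = sub i32 0, %amt
  %ramt = and i32 %neg, 30
  %lo = lshr i32 %x, %ramt
  %rot = or i32 %hi, %lo     ; (or (shl x, pos), (srl x, neg)) -> rotl x, pos
  ret i32 %rot
}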