Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -387,6 +387,8 @@
     SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                                bool DemandHighBits = true);
     SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
+    SDValue buildLoHiRotateVecBitmask(SDValue Shift, bool Low,
+                                      const SDLoc &DL, EVT VT);
     SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
                               SDValue InnerPos, SDValue InnerNeg,
                               unsigned PosOpcode, unsigned NegOpcode,
@@ -4585,6 +4587,48 @@
   return nullptr;
 }
 
+// Are constant vectors combinable for rotate.
+// Like <1, 2, 3, 4> + <31, 30, 29, 28> for int32 vectors.
+static bool areNonSplatRotateCombinableVecs(
+    SDValue LShift, SDValue RShift, unsigned EltSizeInBits) {
+  BuildVectorSDNode *LV = dyn_cast<BuildVectorSDNode>(LShift);
+  BuildVectorSDNode *RV = dyn_cast<BuildVectorSDNode>(RShift);
+  if (!LV || !RV) return false;
+  if (LV->getNumOperands() != RV->getNumOperands()) return false;
+
+  for (unsigned i = 0, e = LV->getNumOperands(); i != e; ++i) {
+    SDValue OpL = LV->getOperand(i);
+    SDValue OpR = RV->getOperand(i);
+    if (OpL.isUndef() || OpR.isUndef())
+      return false;
+    ConstantSDNode *LConst = dyn_cast_or_null<ConstantSDNode>(OpL);
+    ConstantSDNode *RConst = dyn_cast_or_null<ConstantSDNode>(OpR);
+    if (!LConst || !RConst ||
+        LConst->getZExtValue() + RConst->getZExtValue() != EltSizeInBits)
+      return false;
+  }
+  return true;
+}
+
+// buildLoHiRotateVecBitmask - Generate vector of low/high bitmasks for vector
+// of constant integer values (shift sizes).
+// <0, 1, 2, 3> --> <0b0..0000, 0b0..0001, 0b0..0011, 0b0..0111> for Low=true
+// <0, 1, 2, 3> --> <0b0000..0, 0b1000..0, 0b1100..0, 0b1110..0> for Low=false
+SDValue DAGCombiner::buildLoHiRotateVecBitmask(SDValue Shift, bool Low,
+                                               const SDLoc &DL, EVT VT) {
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  const BuildVectorSDNode &BV = *cast<BuildVectorSDNode>(Shift);
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0, e = BV.getNumOperands(); i != e; ++i) {
+    ConstantSDNode &Op = *cast<ConstantSDNode>(BV.getOperand(i));
+    APInt Bits = Low ? APInt::getLowBitsSet(EltSizeInBits, Op.getZExtValue())
+                     : APInt::getHighBitsSet(EltSizeInBits, Op.getZExtValue());
+    Ops.push_back(DAG.getConstant(Bits, DL, VT.getScalarType()));
+  }
+  return DAG.getBuildVector(VT, DL, Ops);
+}
+
 // MatchRotate - Handle an 'or' of two operands.  If this is one of the many
 // idioms for rotate, and if the target supports rotation instructions, generate
 // a rot[lr].
@@ -4661,6 +4705,35 @@
     return Rot.getNode();
   }
 
+  if (areNonSplatRotateCombinableVecs(LHSShiftAmt, RHSShiftAmt, EltSizeInBits)) {
+    SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
+                              LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
+    if (LHSMask.getNode() || RHSMask.getNode()) {
+      if (LHSMask.getNode() && RHSMask.getNode()) {
+        // calculating combined vector mask
+        // (LHSMask | 000...111) & (111...000 | RHSMask)
+        SDValue RHSBits = buildLoHiRotateVecBitmask(LHSShiftAmt, true, DL, VT);
+        SDValue LHSBits = buildLoHiRotateVecBitmask(RHSShiftAmt, false, DL, VT);
+
+        SDValue LMask = DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits);
+        SDValue RMask = DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits);
+
+        SDValue Mask = DAG.getNode(ISD::AND, DL, VT, LMask, RMask);
+        Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
+      }
+      else if (LHSMask.getNode()) {
+        SDValue RHSBits = buildLoHiRotateVecBitmask(LHSShiftAmt, true, DL, VT);
+        SDValue LMask = DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits);
+        Rot = DAG.getNode(ISD::AND, DL, VT, Rot, LMask);
+      }
+      else if (RHSMask.getNode()) {
+        SDValue LHSBits = buildLoHiRotateVecBitmask(RHSShiftAmt, false, DL, VT);
+        SDValue RMask = DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits);
+        Rot = DAG.getNode(ISD::AND, DL, VT, Rot, RMask);
+      }
+    }
+    return Rot.getNode();
+  }
 
   // If there is a mask here, and we have a variable shift, we can't be sure
   // that we're masking out the right stuff.
Index: test/CodeGen/X86/rotate_vec.ll =================================================================== --- test/CodeGen/X86/rotate_vec.ll +++ test/CodeGen/X86/rotate_vec.ll @@ -15,9 +15,7 @@ define <4 x i32> @rot_v4i32_non_splat(<4 x i32> %x) { ; CHECK-LABEL: rot_v4i32_non_splat: ; CHECK: # BB#0: -; CHECK-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, @@ -43,12 +41,8 @@ define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) { ; CHECK-LABEL: rot_v4i32_non_splat_2masks: ; CHECK: # BB#0: -; CHECK-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; CHECK-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] -; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %1 = lshr <4 x i32> %x, %2 = and <4 x i32> %1,