Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -8060,9 +8060,9 @@ /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); +static APInt computeZeroableShuffleElements(ArrayRef Mask, + SDValue V1, SDValue V2) { + APInt Zeroable(Mask.size(), 0); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); @@ -8077,7 +8077,7 @@ int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; + Zeroable.setBit(i); continue; } @@ -8095,17 +8095,19 @@ int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef() || X86::isZeroNode(Op)) - Zeroable[i] = true; + Zeroable.setBit(i); else if (ConstantSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getAPIntValue(); Val = Val.lshr((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); - Zeroable[i] = (Val == 0); + if (Val == 0) + Zeroable.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val = Val.lshr((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); - Zeroable[i] = (Val == 0); + if (Val == 0) + Zeroable.setBit(i); } continue; } @@ -8119,7 +8121,8 @@ SDValue Op = V.getOperand((M * Scale) + j); AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); } - Zeroable[i] = AllZeroable; + if (AllZeroable) + Zeroable.setBit(i); continue; } } @@ -8134,12 +8137,12 @@ // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. -static bool isNonZeroElementsInOrder(const SmallBitVector &Zeroable, +static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. - for (int i = 0, e = Zeroable.size(); i < e; i++) { + for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] < 0) @@ -8163,7 +8166,7 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -8218,19 +8221,9 @@ const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); -// Function convertBitVectorToUnsigned - The function gets SmallBitVector -// as argument and convert him to unsigned. -// The output of the function is not(zeroable) -static unsigned convertBitVectorToUnsigned(const SmallBitVector &Zeroable) { - unsigned convertBit = 0; - for (int i = 0, e = Zeroable.size(); i < e; i++) - convertBit |= !(Zeroable[i]) << i; - return convertBit; -} - // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, - const SmallBitVector &Zeroable, + const APInt &Zeroable, ArrayRef Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8238,7 +8231,7 @@ if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); - unsigned VEXPANDMask = convertBitVectorToUnsigned(Zeroable); + unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); @@ -8372,7 +8365,7 @@ /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "Floating point types are not supported"); MVT EltVT = VT.getVectorElementType(); @@ -8441,7 +8434,7 @@ /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Original, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); @@ -8899,7 +8892,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef Mask, int MaskOffset, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; @@ -8961,7 +8954,7 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -8997,12 +8990,12 @@ /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); - assert(!Zeroable.all() && "Fully zeroable shuffle mask"); + assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) @@ -9300,7 +9293,7 @@ /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; @@ -9456,7 +9449,7 @@ /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); @@ -9810,7 +9803,7 @@ // elements are zeroable. static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, ArrayRef Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); @@ -9899,7 +9892,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); @@ -10034,7 +10027,7 @@ /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10116,7 +10109,7 @@ /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10335,7 +10328,7 @@ /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10418,7 +10411,7 @@ /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10985,7 +10978,7 @@ /// blend if only one input is used. static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse, + const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SDValue V1Mask[16]; SDValue V2Mask[16]; @@ -11046,7 +11039,7 @@ /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11230,7 +11223,7 @@ /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11519,7 +11512,7 @@ /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { @@ -11775,7 +11768,7 @@ /// \brief Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector WidenedMask; @@ -12310,7 +12303,7 @@ /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12407,7 +12400,7 @@ /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12495,7 +12488,7 @@ /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12586,7 +12579,7 @@ /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12690,7 +12683,7 @@ /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12776,7 +12769,7 @@ /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12849,7 +12842,7 @@ /// together based on the available instructions. static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we @@ -13001,7 +12994,7 @@ /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13057,7 +13050,7 @@ /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13103,7 +13096,7 @@ /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13168,7 +13161,7 @@ /// \brief Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13239,7 +13232,7 @@ /// \brief Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13290,7 +13283,7 @@ /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13350,7 +13343,7 @@ /// together based on the available instructions. static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && @@ -13572,8 +13565,8 @@ // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - if (Zeroable.all()) + APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); // Try to collapse shuffles into using a vector type with fewer elements but @@ -26541,10 +26534,11 @@ unsigned NumMaskElts = Mask.size(); bool ContainsZeros = false; - SmallBitVector Zeroable(NumMaskElts, false); + APInt Zeroable(NumMaskElts, false); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; - Zeroable[i] = isUndefOrZero(M); + if (isUndefOrZero(M)) + Zeroable.setBit(i); ContainsZeros |= (M == SM_SentinelZero); } @@ -26825,12 +26819,12 @@ // Attempt to combine to INSERTPS. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { - SmallBitVector Zeroable(4, false); + APInt Zeroable(4, 0); for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] < 0) - Zeroable[i] = true; + Zeroable.setBit(i); - if (Zeroable.any() && + if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32;