Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4622,9 +4622,9 @@
   return ConstsNode;
 }
 
-static SDValue getConstVector(ArrayRef<APInt> Values, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
-  assert(Values.size() == Undefs.size() && "Unequal constant and undef arrays");
+  assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
   SmallVector<SDValue, 32> Ops;
   bool Split = false;
 
@@ -4637,16 +4637,22 @@
   }
 
   MVT EltVT = ConstVecVT.getVectorElementType();
-  for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
     if (Undefs[i]) {
       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
       continue;
     }
-    const APInt &V = Values[i];
+    const APInt &V = Bits[i];
     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
     if (Split) {
       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
+    } else if (EltVT == MVT::f32) {
+      APFloat FV(APFloat::IEEEsingle, V);
+      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+    } else if (EltVT == MVT::f64) {
+      APFloat FV(APFloat::IEEEdouble, V);
+      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
     } else {
       Ops.push_back(DAG.getConstant(V, dl, EltVT));
     }
@@ -5037,6 +5043,77 @@
   return dyn_cast<Constant>(CNode->getConstVal());
 }
 
+// Extract constant bits from constant pool vector.
+static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
+                                          SmallBitVector &UndefElts,
+                                          SmallVectorImpl<APInt> &EltBits) {
+  assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+  assert(EltBits.empty() && "Expected an empty EltBits vector");
+
+  EVT VT = Op.getValueType();
+  unsigned SizeInBits = VT.getSizeInBits();
+  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
+  unsigned NumElts = SizeInBits / EltSizeInBits;
+
+  auto *Cst = getTargetConstantFromNode(Op);
+  if (!Cst)
+    return false;
+
+  Type *CstTy = Cst->getType();
+  if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+    return false;
+
+  // Extract all the undef/constant element data and pack into single bitsets.
+  APInt UndefBits(SizeInBits, 0);
+  APInt MaskBits(SizeInBits, 0);
+
+  unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+  for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
+    auto *COp = Cst->getAggregateElement(i);
+    if (!COp ||
+        !(isa<UndefValue>(COp) || isa<ConstantInt>(COp) ||
+          isa<ConstantFP>(COp)))
+      return false;
+
+    if (isa<UndefValue>(COp)) {
+      APInt EltUndef = APInt::getLowBitsSet(SizeInBits, CstEltSizeInBits);
+      UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+      continue;
+    }
+
+    APInt Bits;
+    if (auto *CInt = dyn_cast<ConstantInt>(COp))
+      Bits = CInt->getValue();
+    else if (auto *CFP = dyn_cast<ConstantFP>(COp))
+      Bits = CFP->getValueAPF().bitcastToAPInt();
+
+    Bits = Bits.zextOrTrunc(SizeInBits);
+    MaskBits |= Bits.shl(i * CstEltSizeInBits);
+  }
+
+  UndefElts = SmallBitVector(NumElts, false);
+  EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
+
+  // Now extract the undef/constant bit data into the target elts.
+  for (unsigned i = 0; i != NumElts; ++i) {
+    APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
+    UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+
+    // Only treat the element as UNDEF if all bits are UNDEF, otherwise
+    // treat it as zero.
+    if (UndefEltBits.isAllOnesValue()) {
+      UndefElts[i] = true;
+      continue;
+    }
+
+    APInt Bits = MaskBits.lshr(i * EltSizeInBits);
+    Bits = Bits.zextOrTrunc(EltSizeInBits);
+    EltBits[i] = Bits.getZExtValue();
+  }
+
+  return true;
+}
+
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
                                         SmallVectorImpl<uint64_t> &RawMask) {
@@ -26308,6 +26385,93 @@
   return false;
 }
 
+// Attempt to constant fold all of the constant source ops.
+// Returns true if the entire shuffle is folded to a constant.
+// TODO: Extend this to merge multiple constant Ops and update the mask.
+static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+                                        ArrayRef<int> Mask, SDValue Root,
+                                        bool HasVariableMask, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget &Subtarget) {
+  MVT VT = Root.getSimpleValueType();
+
+  unsigned SizeInBits = VT.getSizeInBits();
+  unsigned NumMaskElts = Mask.size();
+  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
+  unsigned NumOps = Ops.size();
+
+  // Extract constant bits from each source op.
+  bool OneUseConstantOp = false;
+  SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
+  SmallVector<SmallVector<APInt, 16>, 4> RawBitsOps(NumOps);
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue SrcOp = Ops[i];
+    OneUseConstantOp |= SrcOp.hasOneUse();
+    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
+                                       RawBitsOps[i]))
+      return false;
+  }
+
+  // Only fold if at least one of the constants is only used once or
+  // the combined shuffle has included a variable mask shuffle, this
+  // is to avoid constant pool bloat.
+  if (!OneUseConstantOp && !HasVariableMask)
+    return false;
+
+  // Shuffle the constant bits according to the mask.
+  SmallBitVector UndefElts(NumMaskElts, false);
+  SmallBitVector ZeroElts(NumMaskElts, false);
+  SmallBitVector ConstantElts(NumMaskElts, false);
+  SmallVector<APInt, 16> ConstantBitData(NumMaskElts,
+                                         APInt::getNullValue(MaskSizeInBits));
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int M = Mask[i];
+    if (M == SM_SentinelUndef) {
+      UndefElts[i] = true;
+      continue;
+    } else if (M == SM_SentinelZero) {
+      ZeroElts[i] = true;
+      continue;
+    }
+    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
+
+    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
+    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
+
+    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
+    if (SrcUndefElts[SrcMaskIdx]) {
+      UndefElts[i] = true;
+      continue;
+    }
+
+    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
+    APInt &Bits = SrcEltBits[SrcMaskIdx];
+    if (!Bits) {
+      ZeroElts[i] = true;
+      continue;
+    }
+
+    ConstantElts[i] = true;
+    ConstantBitData[i] = Bits;
+  }
+  assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+
+  // Create the constant data.
+  MVT MaskSVT;
+  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
+    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
+  else
+    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
+
+  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+
+  SDLoc DL(Root);
+  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
+  DCI.AddToWorklist(CstOp.getNode());
+  DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
+  return true;
+}
+
 /// \brief Fully generic combining of x86 shuffle instructions.
 ///
 /// This should be the last combine run over the x86 shuffle instructions. Once
@@ -26491,6 +26655,11 @@
                                     HasVariableMask, DAG, DCI, Subtarget))
     return true;
 
+  // Attempt to constant fold all of the constant source ops.
+  if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
+                                  Subtarget))
+    return true;
+
   // We can only combine unary and binary shuffle mask cases.
   if (Ops.size() > 2)
     return false;
Index: test/CodeGen/X86/vector-shuffle-combining-avx.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -361,12 +361,12 @@
 define <2 x double> @constant_fold_vpermilvar_pd() {
 ; X32-LABEL: constant_fold_vpermilvar_pd:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> , <2 x i64> )
   ret <2 x double> %1
@@ -375,12 +375,12 @@
 define <4 x double> @constant_fold_vpermilvar_pd_256() {
 ; X32-LABEL: constant_fold_vpermilvar_pd_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> , <4 x i64> )
   ret <4 x double> %1
@@ -389,12 +389,12 @@
 define <4 x float> @constant_fold_vpermilvar_ps() {
 ; X32-LABEL: constant_fold_vpermilvar_ps:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,0,2,1]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_ps:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,0,2,1]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> , <4 x i32> )
   ret <4 x float> %1
@@ -403,14 +403,12 @@
 define <8 x float> @constant_fold_vpermilvar_ps_256() {
 ; X32-LABEL: constant_fold_vpermilvar_ps_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_ps_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> , <8 x i32> )
   ret <8 x float> %1
Index: test/CodeGen/X86/vector-shuffle-combining-avx2.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -685,14 +685,12 @@
 define <8 x i32> @constant_fold_permd() {
 ; X32-LABEL: constant_fold_permd:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X32-NEXT:    vpermd {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_permd:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X64-NEXT:    vpermd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
 ; X64-NEXT:    retq
   %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> , <8 x i32> )
   ret <8 x i32> %1
@@ -701,14 +699,12 @@
 define <8 x float> @constant_fold_permps() {
 ; X32-LABEL: constant_fold_permps:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X32-NEXT:    vpermps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_permps:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X64-NEXT:    vpermps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> , <8 x i32> )
   ret <8 x float> %1
@@ -717,14 +713,12 @@
 define <32 x i8> @constant_fold_pshufb_256() {
 ; X32-LABEL: constant_fold_pshufb_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_pshufb_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
 ; X64-NEXT:    retq
   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> , <32 x i8> )
   ret <32 x i8> %1
Index: test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -511,14 +511,12 @@
 define <16 x i8> @constant_fold_pshufb() {
 ; SSE-LABEL: constant_fold_pshufb:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: constant_fold_pshufb:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
 ; AVX-NEXT:    retq
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> , <16 x i8> )
   ret <16 x i8> %1
Index: test/CodeGen/X86/vector-shuffle-combining-xop.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -261,14 +261,12 @@
 define <2 x double> @constant_fold_vpermil2pd() {
 ; X32-LABEL: constant_fold_vpermil2pd:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
-; X32-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
-; X64-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> , <2 x double> , <2 x i64> , i8 2)
   ret <2 x double> %1
@@ -277,16 +275,12 @@
 define <4 x double> @constant_fold_vpermil2pd_256() {
 ; X32-LABEL: constant_fold_vpermil2pd_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT:    vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X32-NEXT:    vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT:    vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X64-NEXT:    vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> , <4 x double> , <4 x i64> , i8 2)
   ret <4 x double> %1
@@ -295,16 +289,12 @@
 define <4 x float> @constant_fold_vpermil2ps() {
 ; X32-LABEL: constant_fold_vpermil2ps:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X32-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2ps:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X64-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> , <4 x float> , <4 x i32> , i8 2)
   ret <4 x float> %1
@@ -313,16 +303,12 @@
 define <8 x float> @constant_fold_vpermil2ps_256() {
 ; X32-LABEL: constant_fold_vpermil2ps_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X32-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2ps_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X64-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> , <8 x float> , <8 x i32> , i8 2)
   ret <8 x float> %1
@@ -331,16 +317,12 @@
 define <16 x i8> @constant_fold_vpperm() {
 ; X32-LABEL: constant_fold_vpperm:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X32-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpperm:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X64-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; X64-NEXT:    retq
   %1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> , <16 x i8> , <16 x i8> )
   ret <16 x i8> %1
Index: test/CodeGen/X86/vector-shuffle-mmx.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-mmx.ll
+++ test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -42,10 +42,8 @@
 ; X32-NEXT:    xorps %xmm0, %xmm0
 ; X32-NEXT:    movlps %xmm0, (%esp)
 ; X32-NEXT:    movq (%esp), %mm0
-; X32-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm1
 ; X32-NEXT:    xorl %edi, %edi
 ; X32-NEXT:    maskmovq %mm1, %mm0
@@ -58,10 +56,8 @@
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movlps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{.*}}(%rip), %rax
+; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm1
 ; X64-NEXT:    xorl %edi, %edi
 ; X64-NEXT:    maskmovq %mm1, %mm0
Index: test/CodeGen/X86/vselect-avx.ll
===================================================================
--- test/CodeGen/X86/vselect-avx.ll
+++ test/CodeGen/X86/vselect-avx.ll
@@ -18,13 +18,10 @@
 define void @test(<4 x i16>* %a, <4 x i16>* %b) {
 ; AVX-LABEL: test:
 ; AVX:       ## BB#0: ## %body
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vmovq %xmm1, (%rdi)
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    movq {{.*}}(%rip), %rax
+; AVX-NEXT:    movq %rax, (%rdi)
+; AVX-NEXT:    movq {{.*}}(%rip), %rax
+; AVX-NEXT:    movq %rax, (%rsi)
 ; AVX-NEXT:    retq
 body:
   %predphi = select <4 x i1> , <4 x i16> , <4 x i16> 
Index: test/CodeGen/X86/widen_load-2.ll
===================================================================
--- test/CodeGen/X86/widen_load-2.ll
+++ test/CodeGen/X86/widen_load-2.ll
@@ -372,15 +372,10 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = <158,158,158,u>
-; X86-NEXT:    pshufb %xmm0, %xmm1
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X86-NEXT:    pextrw $0, %xmm1, (%edx)
+; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-NEXT:    pextrw $0, %xmm0, (%edx)
 ; X86-NEXT:    movb $-98, 2(%edx)
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = <1,1,1,u>
-; X86-NEXT:    pshufb %xmm0, %xmm1
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
 ; X86-NEXT:    pextrw $0, %xmm0, (%ecx)
 ; X86-NEXT:    movb $1, 2(%ecx)
 ; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -396,15 +391,10 @@
 ;
 ; X64-LABEL: rot:
 ; X64:       # BB#0: # %entry
-; X64-NEXT:    movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-NEXT:    movdqa {{.*#+}} xmm1 = <158,158,158,u>
-; X64-NEXT:    pshufb %xmm0, %xmm1
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X64-NEXT:    pextrw $0, %xmm1, (%rsi)
+; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-NEXT:    pextrw $0, %xmm0, (%rsi)
 ; X64-NEXT:    movb $-98, 2(%rsi)
-; X64-NEXT:    movdqa {{.*#+}} xmm1 = <1,1,1,u>
-; X64-NEXT:    pshufb %xmm0, %xmm1
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
 ; X64-NEXT:    pextrw $0, %xmm0, (%rdx)
 ; X64-NEXT:    movb $1, 2(%rdx)
 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
Index: test/CodeGen/X86/widen_shuffle-1.ll
===================================================================
--- test/CodeGen/X86/widen_shuffle-1.ll
+++ test/CodeGen/X86/widen_shuffle-1.ll
@@ -111,16 +111,14 @@
 ; X86-LABEL: shuf5:
 ; X86:       # BB#0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
-; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; X86-NEXT:    movq %xmm0, (%eax)
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: shuf5:
 ; X64:       # BB#0:
-; X64-NEXT:    movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
-; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; X64-NEXT:    movq %xmm0, (%rdi)
+; X64-NEXT:    movq {{.*}}(%rip), %rax
+; X64-NEXT:    movq %rax, (%rdi)
 ; X64-NEXT:    retq
   %v = shufflevector <2 x i8> , <2 x i8> undef, <8 x i32> 
   store <8 x i8> %v, <8 x i8>* %p, align 8
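
For reference (not part of the patch): a minimal standalone .ll sketch of the kind of case the new combineX86ShufflesConstants() path folds. The operand values below are illustrative assumptions, chosen to line up with the updated constant_fold_vpermilvar_ps CHECK lines; with the patch applied, llc -mattr=+avx should emit a single vmovaps of the constant [4.0,1.0,3.0,2.0] instead of a vpermilps on a memory operand.

; Both the source vector and the shuffle indices are constant-pool constants,
; so the whole vpermilvar.ps shuffle chain can be folded to one constant.
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)

define <4 x float> @fold_example() {
  %r = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
  ret <4 x float> %r
}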