Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -11284,6 +11284,74 @@
     }
   }
 
+  // Lower shuffles of entire half vectors into subvector extracts/inserts:
+  // FIXME: Add 512-bit vector support.
+  if (VT.is256BitVector()) {
+    unsigned HalfNumElts = VT.getVectorNumElements() / 2;
+    bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+    bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+    MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+    // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+    if (UndefUpper &&
+        isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+      SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, V1,
+                               DAG.getIntPtrConstant(HalfNumElts, dl));
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, DAG.getUNDEF(VT), Hi,
+                         DAG.getIntPtrConstant(0, dl));
+    }
+
+    // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+    if (UndefLower &&
+        isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+      SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, V1,
+                               DAG.getIntPtrConstant(0, dl));
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, DAG.getUNDEF(VT), Lo,
+                         DAG.getIntPtrConstant(HalfNumElts, dl));
+    }
+
+    // vector_shuffle <m, m, m, m, u, u, u, u> or <m, m, u, u>
+    if (UndefUpper) {
+      SDValue Half1, Half2;
+      SmallVector<int, 8> HalfMask;
+      for (unsigned i = 0; i != HalfNumElts; ++i) {
+        int M = SVOp->getMaskElt(i);
+        if (M < 0) {
+          HalfMask.push_back(M);
+          continue;
+        }
+        // If the shuffle only uses the lower halves of the inputs,
+        // then extract them and perform the 'half' shuffle.
+        // TODO: Investigate whether shuffles of other halves are beneficial.
+        if ((M % NumElements) >= (int)HalfNumElts)
+          break;
+        SDValue V = (M >= (int)NumElements ? V2 : V1);
+        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, V,
+                        DAG.getIntPtrConstant(0, dl));
+        M %= NumElements;
+        if (!Half1 || Half1 == V) {
+          HalfMask.push_back(M);
+          Half1 = V;
+          continue;
+        }
+        if (!Half2 || Half2 == V) {
+          HalfMask.push_back(M + HalfNumElts);
+          Half2 = V;
+          continue;
+        }
+        break;
+      }
+
+      if (HalfMask.size() == HalfNumElts) {
+        assert(Half1 && "Unexpected undefined shuffle");
+        Half2 = (Half2 ? Half2 : DAG.getUNDEF(HalfVT));
+        SDValue V = DAG.getVectorShuffle(HalfVT, dl, Half1, Half2, HalfMask);
+        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, DAG.getUNDEF(VT), V,
+                           DAG.getIntPtrConstant(0, dl));
+      }
+    }
+  }
+
   // For each vector width, delegate to a specialized lowering routine.
   if (VT.is128BitVector())
     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
@@ -22457,39 +22525,8 @@
   return TargetLowering::isGAPlusOffset(N, GA, Offset);
 }
 
-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512 bit vectors as well.
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget* Subtarget) {
@@ -22563,24 +22600,6 @@
     return DCI.CombineTo(N, InsV);
   }
 
-  //===--------------------------------------------------------------------===//
-  // Combine some shuffles into subvector extracts and inserts:
-  //
-
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
-    return DCI.CombineTo(N, InsV);
-  }
-
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  if (isShuffleLow128VectorInsertHigh(SVOp)) {
-    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
-    return DCI.CombineTo(N, InsV);
-  }
-
   return SDValue();
 }
 
@@ -26349,7 +26368,7 @@
 
   // If we're negating a FMUL node on a target with FMA, then we can avoid the
   // use of a constant by performing (-0 - A*B) instead.
-  // FIXME: Check rounding control flags as well once it becomes available. 
+  // FIXME: Check rounding control flags as well once it becomes available.
   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
      Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
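
Note: for reference, the first of the three mask shapes handled by the new block, written as IR. This is a hand-written illustration, not one of the tests in this patch, and the function name is made up: the defined lower half of the result is exactly the upper half of the input, so the shuffle reduces to a single 128-bit subvector extract.

define <4 x i64> @move_upper_half_down(<4 x i64> %a) {
  ; Mask <2, 3, u, u>: with the change above this should lower to a plain
  ; 128-bit extract (e.g. vextracti128) instead of a ymm-wide vperm2i128.
  %s = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ret <4 x i64> %s
}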
Index: test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -159,11 +159,11 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -3284,6 +3284,24 @@
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
 ; AVX1:       # BB#0:
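
The new shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u test above exercises the third pattern: every defined mask element reads the lower half of one of the two inputs, so the whole operation is performed at 128-bit width (a single vpunpckhwd on xmm registers). The same idea in a smaller hand-written sketch (illustration only, not part of the patch):

define <4 x double> @interleave_low_halves(<4 x double> %a, <4 x double> %b) {
  ; Elements 1 and 5 both live in the lower 128-bit halves of %a and %b, so
  ; this is expected to become a single xmm vunpckhpd (compare the
  ; shuffle_v4f64_15uu test added further down).
  %s = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
  ret <4 x double> %s
}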
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -159,11 +159,11 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3284,6 +3284,24 @@ ret <16 x i16> %shuffle } +define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { +; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u: +; ALL: # BB#0: +; ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; ALL-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { +; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; ALL: # BB#0: +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; ALL-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> + ret <16 x i16> %shuffle +} + define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX1: # BB#0: Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -325,7 +325,7 @@ ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -2013,6 +2013,34 @@ ret <32 x i8> %shuffle } +define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: 
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -236,23 +236,11 @@
 }
 
 define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0423:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4f64_0423:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_0423:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vbroadcastsd %xmm1, %ymm1
-; AVX512VL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v4f64_0423:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
   ret <4 x double> %shuffle
 }
@@ -488,10 +476,19 @@
   ret <4 x double> %shuffle
 }
 
+define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_15uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+  ret <4 x double> %shuffle
+}
+
 define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_11uu:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; ALL-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x double> %shuffle
 }
@@ -750,21 +747,21 @@
 define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0142:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0142:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_0142:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vinserti32x4 $1, %xmm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX512VL-NEXT:    retq
@@ -778,21 +775,21 @@
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0412:
 ; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_0412:
 ; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
@@ -1134,21 +1131,20 @@
   ret <4 x i64> %shuffle
 }
 
+define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_15uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+  ret <4 x i64> %shuffle
+}
+
 define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_11uu:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4i64_11uu:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_11uu:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v4i64_11uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x i64> %shuffle
 }
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -73,10 +73,10 @@
 define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_00040000:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_00040000:
@@ -864,6 +864,24 @@
   ret <8 x float> %shuffle
 }
 
+define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1188uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1111uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
 define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_5555uuuu:
 ; AVX1:       # BB#0:
@@ -949,10 +967,10 @@
 define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_00040000:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_00040000:
@@ -1961,6 +1979,25 @@
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2222uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+
+define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2A3Buuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_44444444:
 ; AVX1:       # BB#0:
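
A note on the check prefixes: the ALL prefix is passed by every RUN invocation in these files, which is why several tests collapse from separate AVX1/AVX2/AVX512VL blocks into a single ALL block once all targets emit identical (now 128-bit) code. The RUN lines follow roughly this pattern (paraphrased from the existing test headers; they are not changed by this patch):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL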
Index: test/CodeGen/X86/vector-zext.ll
===================================================================
--- test/CodeGen/X86/vector-zext.ll
+++ test/CodeGen/X86/vector-zext.ll
@@ -1167,7 +1167,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    retq
 entry:
@@ -1251,7 +1251,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,2,3,5,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX2-NEXT:    retq
 entry:
@@ -1319,7 +1319,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    retq
 entry:
@@ -1366,7 +1366,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    retq
 entry:
@@ -1428,7 +1428,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    retq
 entry:
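
The vector-zext.ll diffs above fall out of the same lowering: vpmovzx* reads only the low xmm of its source, so the shuffle feeding it has a whole-undef upper half and can now run at 128-bit width (vpsrldq/vpshufd on xmm, or a plain vextracti128). A minimal hand-written example of the pattern (illustration only, not one of the tests in this patch):

define <8 x i32> @zext_upper_half(<16 x i16> %a) {
  ; The extend consumes only 8 x i16, so extracting the upper half should
  ; become vextracti128 + vpmovzxwd rather than a ymm-wide vperm2i128.
  %hi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %z = zext <8 x i16> %hi to <8 x i32>
  ret <8 x i32> %z
}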