Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10306,6 +10306,98 @@
   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
 }
 
+/// Lower shuffles where an entire half of a 256/512 bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
+                                               SDValue V2, ArrayRef<int> Mask,
+                                               SelectionDAG &DAG) {
+  assert((VT.getSizeInBits() % 256) == 0 &&
+         "Expected 256-bit vector or larger");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfNumElts = NumElts / 2;
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+  if (!UndefLower && !UndefUpper)
+    return SDValue();
+
+  // Upper half is undef and lower half is whole upper subvector.
+  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (UndefUpper &&
+      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+                             DAG.getIntPtrConstant(HalfNumElts, DL));
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
+  // Lower half is undef and upper half is whole lower subvector.
+  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (UndefLower &&
+      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+                             DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+                       DAG.getIntPtrConstant(HalfNumElts, DL));
+  }
+
+  // If the shuffle only uses the lower halves of the input operands,
+  // then extract them and perform the 'half' shuffle at half width.
+  // e.g. vector_shuffle <0, 8, 1, 9, u, u, u, u> or <0, 4, u, u>
+  int HalfIdx1 = -1, HalfIdx2 = -1;
+  SmallVector<int, 8> HalfMask;
+  unsigned Offset = UndefLower ? HalfNumElts : 0;
+  for (unsigned i = 0; i != HalfNumElts; ++i) {
+    int M = Mask[i + Offset];
+    if (M < 0) {
+      HalfMask.push_back(M);
+      continue;
+    }
+
+    // Only shuffle using the lower halves of the inputs.
+    // TODO: Investigate usefulness of shuffling with upper halves.
+    if ((M % NumElts) >= HalfNumElts)
+      break;
+
+    int HalfIdx = (M / HalfNumElts);
+    M %= NumElts;
+
+    if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
+      HalfMask.push_back(M);
+      HalfIdx1 = HalfIdx;
+      continue;
+    }
+    if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
+      HalfMask.push_back(M + HalfNumElts);
+      HalfIdx2 = HalfIdx;
+      continue;
+    }
+    break;
+  }
+
+  auto GetHalfVector = [&](int HalfIdx) {
+    if (HalfIdx < 0)
+      return DAG.getUNDEF(HalfVT);
+    SDValue V = (HalfIdx < 2 ? V1 : V2);
+    HalfIdx = (HalfIdx % 2) * HalfNumElts;
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+                       DAG.getIntPtrConstant(HalfIdx, DL));
+  };
+
+  if (HalfMask.size() == HalfNumElts) {
+    SDValue Half1 = GetHalfVector(HalfIdx1);
+    SDValue Half2 = GetHalfVector(HalfIdx2);
+    SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+                       DAG.getIntPtrConstant(Offset, DL));
+  }
+
+  return SDValue();
+}
+
 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
 /// given mask.
 ///
@@ -10875,6 +10967,10 @@
           DL, VT, V1, V2, Mask, Subtarget, DAG))
     return Insertion;
 
+  // Handle special cases where the lower or upper half is UNDEF.
+  if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, DAG))
+    return V;
+
   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
   // can check for those subtargets here and avoid much of the subtarget
   // querying in the per-vector-type lowering routines. With AVX1 we have
@@ -11089,6 +11185,10 @@
   assert(Subtarget->hasAVX512() &&
          "Cannot lower 512-bit vectors w/ basic ISA!");
 
+  // Handle special cases where the lower or upper half is UNDEF.
+  if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, DAG))
+    return V;
+
   // Check for being able to broadcast a single element.
   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask,
                                                         Subtarget, DAG))
@@ -19866,7 +19966,7 @@
   assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
   MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
   // Use the original mask here, do not modify the mask twice
-  Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); 
+  Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
 
   // The value that should be stored
   MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
@@ -22608,39 +22708,8 @@
   return TargetLowering::isGAPlusOffset(N, GA, Offset);
 }
 
-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512 bit vectors as well.
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget* Subtarget) {
@@ -22714,24 +22783,6 @@
     return DCI.CombineTo(N, InsV);
   }
 
-  //===--------------------------------------------------------------------===//
-  // Combine some shuffles into subvector extracts and inserts:
-  //
-
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
-    return DCI.CombineTo(N, InsV);
-  }
-
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  if (isShuffleLow128VectorInsertHigh(SVOp)) {
-    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
-    return DCI.CombineTo(N, InsV);
-  }
-
   return SDValue();
 }
 
@@ -26501,7 +26552,7 @@
 
   // If we're negating a FMUL node on a target with FMA, then we can avoid the
   // use of a constant by performing (-0 - A*B) instead.
-  // FIXME: Check rounding control flags as well once it becomes available. 
+  // FIXME: Check rounding control flags as well once it becomes available.
   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
       Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
Index: test/CodeGen/X86/avx-splat.ll
===================================================================
--- test/CodeGen/X86/avx-splat.ll
+++ test/CodeGen/X86/avx-splat.ll
@@ -99,7 +99,7 @@
 ; CHECK-LABEL: funcF:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovd %edi, %xmm0
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,0]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
Index: test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -159,11 +159,11 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -3250,6 +3250,22 @@
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
 ; AVX1:       # BB#0:
@@ -3284,6 +3300,24 @@
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
 ; AVX1:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -325,7 +325,7 @@
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -2013,6 +2013,34 @@
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX1:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -236,23 +236,11 @@
 }
 
 define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0423:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4f64_0423:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_0423:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vbroadcastsd %xmm1, %ymm1
-; AVX512VL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v4f64_0423:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
   ret <4 x double> %shuffle
 }
@@ -488,10 +476,19 @@
   ret <4 x double> %shuffle
 }
 
+define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_15uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+  ret <4 x double> %shuffle
+}
+
 define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_11uu:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; ALL-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x double> %shuffle
@@ -734,13 +731,15 @@
 ;
 ; AVX2-LABEL: shuffle_v4i64_0124:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_0124:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
@@ -750,21 +749,21 @@
 define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0142:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0142:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_0142:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vinserti32x4 $1, %xmm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX512VL-NEXT:    retq
@@ -778,21 +777,21 @@
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0412:
 ; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_0412:
 ; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
@@ -947,14 +946,16 @@
 ;
 ; AVX2-LABEL: shuffle_v4i64_1251:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_1251:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512VL-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1]
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
@@ -1134,21 +1135,20 @@
   ret <4 x i64> %shuffle
 }
 
+define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_15uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+  ret <4 x i64> %shuffle
+}
+
 define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_11uu:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4i64_11uu:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_11uu:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v4i64_11uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x i64> %shuffle
 }
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -73,10 +73,10 @@
 define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_00040000:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_00040000:
@@ -832,17 +832,11 @@
 }
 
 define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_uuuu1111:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_uuuu1111:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_uuuu1111:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
   ret <8 x float> %shuffle
 }
@@ -864,6 +858,44 @@
   ret <8 x float> %shuffle
 }
 
+define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1188uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuuu3210:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuuu1188:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1111uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
 define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_5555uuuu:
 ; AVX1:       # BB#0:
@@ -949,10 +981,10 @@
 define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_00040000:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_00040000:
@@ -1948,19 +1980,37 @@
 define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_uuuu1111:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_uuuu1111:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2222uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2A3Buuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_44444444:
 ; AVX1:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -154,8 +154,7 @@
 define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) {
 ; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = <0,1,2,19,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; ALL-NEXT:    retq
   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i32> %c
Index: test/CodeGen/X86/vector-shuffle-512-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -23,21 +23,19 @@
   ret <32 x i16> %c
 }
 
-define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
+define <32 x i16> @shuffle_v32i16_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu16 {{.*#+}} zmm2 = <0,32,1,33,2,34,3,35,8,40,9,41,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT:    vpermt2w %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; ALL-NEXT:    retq
   %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <32 x i16> %c
 }
 
-define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
+define <32 x i16> @shuffle_v32i16_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu16 {{.*#+}} zmm2 = <4,36,5,37,6,38,7,39,12,44,13,45,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT:    vpermt2w %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
 ; ALL-NEXT:    retq
   %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <32 x i16> %c
Index: test/CodeGen/X86/vector-zext.ll
===================================================================
--- test/CodeGen/X86/vector-zext.ll
+++ test/CodeGen/X86/vector-zext.ll
@@ -1167,7 +1167,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    retq
 entry:
@@ -1251,7 +1251,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,2,3,5,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX2-NEXT:    retq
 entry:
@@ -1319,7 +1319,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    retq
 entry:
@@ -1366,7 +1366,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    retq
 entry:
@@ -1428,7 +1428,7 @@
 ;
 ; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    retq
 entry: