Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8313,6 +8313,52 @@
   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
 }
 
+/// \brief Try to lower a shuffle mask to a UNPCKL or UNPCKH instruction.
+///
+/// This function attempts to match a shuffle mask against the UNPCKL and
+/// UNPCKH instruction patterns, trying all combinations of the input
+/// operands.
+static SDValue lowerVectorUnpack(SDLoc DL, MVT VT, SDValue V1, SDValue V2,
+                                 ArrayRef<int> Mask, SelectionDAG &DAG) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+  assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
+
+  auto Lower = [&](unsigned Op, int Offset) -> SDValue {
+    // UNPCK: X[Ofs], Y[Ofs], X[Ofs+1], Y[Ofs+1], ...
+    // Attempt to match both operands to the 'X' and 'Y' input
+    // operands of an UNPCK instruction.
+    // Note that these are repeated 128-bit lane unpacks.
+    bool MatchV1X = true, MatchV2X = true;
+    bool MatchV1Y = true, MatchV2Y = true;
+    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+      for (unsigned i = 0, j = l + Offset; i != NumLaneElts; i += 2, ++j) {
+        int X = Mask[l + i];
+        int Y = Mask[l + i + 1];
+        MatchV1X &= isUndefOrEqual(X, j);
+        MatchV1Y &= isUndefOrEqual(Y, j);
+        MatchV2X &= isUndefOrEqual(X, j + NumElts);
+        MatchV2Y &= isUndefOrEqual(Y, j + NumElts);
+      }
+    }
+
+    if ((MatchV1X || MatchV2X) && (MatchV1Y || MatchV2Y))
+      return DAG.getNode(Op, DL, VT, MatchV1X ? V1 : V2, MatchV1Y ? V1 : V2);
+
+    // No match found.
+    return SDValue();
+  };
+
+  if (SDValue S = Lower(X86ISD::UNPCKL, 0))
+    return S;
+  if (SDValue S = Lower(X86ISD::UNPCKH, NumLaneElts / 2))
+    return S;
+
+  // No match found.
+  return SDValue();
+}
+
 // Check for whether we can use INSERTPS to perform the shuffle. We only use
 // INSERTPS when the V1 elements are already in the correct locations
 // because otherwise we can just always use two SHUFPS instructions which
@@ -8432,11 +8478,9 @@
   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v2f64, V1, V2, Mask, DAG))
+    return S;
 
   // If we have a single input, insert that into V1 if we can do so cheaply.
   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
@@ -8529,11 +8573,9 @@
     return Insertion;
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return S;
 
   if (Subtarget->hasSSE41())
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
@@ -8693,11 +8735,9 @@
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v4f32, V1, V2, Mask, DAG)) + return S; // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that @@ -8795,11 +8835,9 @@ lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Masked; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return S; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8874,11 +8912,9 @@ DL, MVT::v8i16, V, V, Mask, DAG)) return Shift; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); - if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v8i16, V, V, Mask, DAG)) + return S; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -9511,11 +9547,9 @@ lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Masked; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); - if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return S; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -10456,12 +10490,9 @@ DAG); } - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v4f64, V1, V2, Mask, DAG)) + return S; // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. @@ -10563,11 +10594,9 @@ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); } - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + // Try to use dedicated unpack instructions. 
+ if (SDValue S = lowerVectorUnpack(DL, MVT::v4i64, V1, V2, Mask, DAG)) + return S; } // AVX2 provides a direct instruction for permuting a single input across @@ -10631,11 +10660,9 @@ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v8f32, V1, V2, Mask, DAG)) + return S; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. We also need to squash the @@ -10727,11 +10754,9 @@ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return S; } // If the shuffle patterns aren't repeated but it is a single input, directly @@ -10793,19 +10818,9 @@ Subtarget, DAG)) return Blend; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, - // First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(Mask, - // First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v16i16, V1, V2, Mask, DAG)) + return S; if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i16 @@ -10880,23 +10895,9 @@ Subtarget, DAG)) return Blend; - // Use dedicated unpack instructions for masks that match their pattern. - // Note that these are repeated 128-bit lane unpacks, not unpacks across all - // 256-bit lanes. - if (isShuffleEquivalent( - Mask, - // First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); - if (isShuffleEquivalent( - Mask, - // First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + // Try to use dedicated unpack instructions. 
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return S;
 
   if (isSingleInputShuffleMask(Mask)) {
     // There are no generalized cross-lane shuffle operations available on i8
@@ -10994,12 +10995,9 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  // X86 has dedicated unpack instructions that can handle specific blend
-  // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v8f64, V1, V2, Mask, DAG))
+    return S;
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
@@ -11016,15 +11014,9 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
-                          0, 16, 1, 17, 4, 20, 5, 21,
-                          8, 24, 9, 25, 12, 28, 13, 29))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
-  if (isShuffleEquivalent(Mask,
-                          2, 18, 3, 19, 6, 22, 7, 23,
-                          10, 26, 11, 27, 14, 30, 15, 31))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v16f32, V1, V2, Mask, DAG))
+    return S;
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
@@ -11041,12 +11033,9 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  // X86 has dedicated unpack instructions that can handle specific blend
-  // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v8i64, V1, V2, Mask, DAG))
+    return S;
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
@@ -11063,15 +11052,9 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
-                          0, 16, 1, 17, 4, 20, 5, 21,
-                          8, 24, 9, 25, 12, 28, 13, 29))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
-  if (isShuffleEquivalent(Mask,
-                          2, 18, 3, 19, 6, 22, 7, 23,
-                          10, 26, 11, 27, 14, 30, 15, 31))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v16i32, V1, V2, Mask, DAG))
+    return S;
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1074,6 +1074,21 @@
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_40u1:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_40u1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
+  ret <4 x i32> %shuffle
+}
+
 define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: shuffle_v4i32_3456:
 ; SSE2:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -407,10 +407,7 @@
 define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00224466:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   ret <8 x double> %shuffle
@@ -431,10 +428,7 @@
 define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_11335577:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
   ret <8 x double> %shuffle
@@ -1125,10 +1119,7 @@
 define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00224466:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
-; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   ret <8 x i64> %shuffle
@@ -1149,10 +1140,7 @@
 define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_11335577:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
-; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
   ret <8 x i64> %shuffle
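
---

For anyone who wants to experiment with the matching logic outside of SelectionDAG, the following is a minimal standalone sketch of the same algorithm. It is illustrative only: matchUnpack, UnpackKind, and UnpackMatch are hypothetical names invented for this sketch (they are not part of the patch or of LLVM), negative mask elements stand in for undef, and the two checks in main reuse the masks from the shuffle_v4i32_40u1 and shuffle_v8i64_00224466 tests above.

// unpack_sketch.cpp - standalone sketch of lowerVectorUnpack's matcher.
// Build: clang++ -std=c++14 unpack_sketch.cpp
#include <cassert>
#include <cstdio>
#include <initializer_list>
#include <vector>

// Negative mask values denote undef elements, mirroring LLVM shuffle masks.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return Val < 0 || Val == CmpVal;
}

enum class UnpackKind { None, Lo, Hi };

struct UnpackMatch {
  UnpackKind Kind = UnpackKind::None;
  bool XFromV1 = false; // First unpack operand is V1 (else V2).
  bool YFromV1 = false; // Second unpack operand is V1 (else V2).
};

// Match Mask (NumElts elements, split into 128-bit lanes of NumLaneElts
// elements) against the repeated-lane UNPCKL (Offset = 0) and UNPCKH
// (Offset = NumLaneElts / 2) patterns, trying each input in each operand
// position, exactly as lowerVectorUnpack does.
static UnpackMatch matchUnpack(const std::vector<int> &Mask,
                               unsigned NumElts, unsigned NumLaneElts) {
  assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
  for (UnpackKind Kind : {UnpackKind::Lo, UnpackKind::Hi}) {
    unsigned Offset = (Kind == UnpackKind::Lo) ? 0 : NumLaneElts / 2;
    bool MatchV1X = true, MatchV2X = true;
    bool MatchV1Y = true, MatchV2Y = true;
    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
      for (unsigned i = 0, j = l + Offset; i != NumLaneElts; i += 2, ++j) {
        int X = Mask[l + i];     // Even result element: X[j].
        int Y = Mask[l + i + 1]; // Odd result element:  Y[j].
        MatchV1X &= isUndefOrEqual(X, j);
        MatchV1Y &= isUndefOrEqual(Y, j);
        MatchV2X &= isUndefOrEqual(X, j + NumElts);
        MatchV2Y &= isUndefOrEqual(Y, j + NumElts);
      }
    }
    if ((MatchV1X || MatchV2X) && (MatchV1Y || MatchV2Y))
      return {Kind, MatchV1X, MatchV1Y};
  }
  return {};
}

int main() {
  // shuffle_v4i32_40u1: v4i32 mask <4, 0, undef, 1> matches UNPCKL with
  // commuted operands (X from V2, Y from V1), i.e. the single
  // "punpckldq xmm1, xmm0" the new test expects.
  UnpackMatch M = matchUnpack({4, 0, -1, 1}, /*NumElts=*/4, /*NumLaneElts=*/4);
  assert(M.Kind == UnpackKind::Lo && !M.XFromV1 && M.YFromV1);

  // shuffle_v8i64_00224466: v8i64 (512-bit, two elements per 128-bit lane)
  // mask 0,0,2,2,4,4,6,6 matches UNPCKL with V1 in both positions, i.e. the
  // single vpunpcklqdq the updated test expects.
  M = matchUnpack({0, 0, 2, 2, 4, 4, 6, 6}, /*NumElts=*/8, /*NumLaneElts=*/2);
  assert(M.Kind == UnpackKind::Lo && M.XFromV1 && M.YFromV1);

  std::puts("both masks matched as expected");
  return 0;
}

The design point this exercises, visible in the first check, is that the matcher tries each input in both unpack operand positions. Commuted masks such as <4, 0, undef, 1> therefore still lower to a single unpack with the operands swapped, which the fixed per-type isShuffleEquivalent patterns removed by this patch could not catch.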