Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8313,6 +8313,52 @@
   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
 }
 
+/// \brief Try to lower a shuffle mask to a UNPCKL or UNPCKH instruction.
+///
+/// This function attempts to match a shuffle mask against the UNPCKL and
+/// UNPCKH instruction patterns, trying all combinations of the input
+/// operands.
+static SDValue lowerVectorUnpack(SDLoc DL, MVT VT, SDValue V1, SDValue V2,
+                                 ArrayRef<int> Mask, SelectionDAG &DAG) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+  assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
+
+  auto Lower = [&](unsigned Op, int Offset) -> SDValue {
+    // UNPCK: X[Ofs], Y[Ofs], X[Ofs+1], Y[Ofs+1], ...
+    // Attempt to match both operands to the 'X' and 'Y' input
+    // operands of an UNPCK instruction.
+    // Note that these are repeated 128-bit lane unpacks.
+    bool MatchV1X = true, MatchV2X = true;
+    bool MatchV1Y = true, MatchV2Y = true;
+    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+      for (unsigned i = 0, j = l + Offset; i != NumLaneElts; i += 2, ++j) {
+        int X = Mask[l + i];
+        int Y = Mask[l + i + 1];
+        MatchV1X &= isUndefOrEqual(X, j);
+        MatchV1Y &= isUndefOrEqual(Y, j);
+        MatchV2X &= isUndefOrEqual(X, j + NumElts);
+        MatchV2Y &= isUndefOrEqual(Y, j + NumElts);
+      }
+    }
+
+    if ((MatchV1X || MatchV2X) && (MatchV1Y || MatchV2Y))
+      return DAG.getNode(Op, DL, VT, MatchV1X ? V1 : V2, MatchV1Y ? V1 : V2);
+
+    // No match found.
+    return SDValue();
+  };
+
+  if (SDValue S = Lower(X86ISD::UNPCKL, 0))
+    return S;
+  if (SDValue S = Lower(X86ISD::UNPCKH, NumLaneElts / 2))
+    return S;
+
+  // No match found.
+  return SDValue();
+}
+
 // Check for whether we can use INSERTPS to perform the shuffle. We only use
 // INSERTPS when the V1 elements are already in the correct locations
 // because otherwise we can just always use two SHUFPS instructions which
@@ -8432,11 +8478,9 @@
   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v2f64, V1, V2, Mask, DAG))
+    return S;
 
   // If we have a single input, insert that into V1 if we can do so cheaply.
   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
@@ -8529,11 +8573,9 @@
     return Insertion;
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return S;
 
   if (Subtarget->hasSSE41())
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
@@ -8693,11 +8735,9 @@
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v4f32, V1, V2, Mask, DAG)) + return S; // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that @@ -8795,11 +8835,9 @@ lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Masked; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return S; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8874,11 +8912,9 @@ DL, MVT::v8i16, V, V, Mask, DAG)) return Shift; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); - if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v8i16, V, V, Mask, DAG)) + return S; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -9511,11 +9547,9 @@ lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Masked; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); - if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return S; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -10456,12 +10490,9 @@ DAG); } - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v4f64, V1, V2, Mask, DAG)) + return S; // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. @@ -10563,11 +10594,9 @@ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); } - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + // Try to use dedicated unpack instructions. 
+ if (SDValue S = lowerVectorUnpack(DL, MVT::v4i64, V1, V2, Mask, DAG)) + return S; } // AVX2 provides a direct instruction for permuting a single input across @@ -10631,11 +10660,9 @@ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v8f32, V1, V2, Mask, DAG)) + return S; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. We also need to squash the @@ -10727,11 +10754,9 @@ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return S; } // If the shuffle patterns aren't repeated but it is a single input, directly @@ -10793,19 +10818,9 @@ Subtarget, DAG)) return Blend; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, - // First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(Mask, - // First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + // Try to use dedicated unpack instructions. + if (SDValue S = lowerVectorUnpack(DL, MVT::v16i16, V1, V2, Mask, DAG)) + return S; if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i16 @@ -10880,23 +10895,9 @@ Subtarget, DAG)) return Blend; - // Use dedicated unpack instructions for masks that match their pattern. - // Note that these are repeated 128-bit lane unpacks, not unpacks across all - // 256-bit lanes. - if (isShuffleEquivalent( - Mask, - // First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); - if (isShuffleEquivalent( - Mask, - // First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + // Try to use dedicated unpack instructions. 
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return S;
 
   if (isSingleInputShuffleMask(Mask)) {
     // There are no generalized cross-lane shuffle operations available on i8
@@ -10994,12 +10995,9 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  // X86 has dedicated unpack instructions that can handle specific blend
-  // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v8f64, V1, V2, Mask, DAG))
+    return S;
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
@@ -11016,15 +11014,9 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
-                          0, 16, 1, 17, 4, 20, 5, 21,
-                          8, 24, 9, 25, 12, 28, 13, 29))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
-  if (isShuffleEquivalent(Mask,
-                          2, 18, 3, 19, 6, 22, 7, 23,
-                          10, 26, 11, 27, 14, 30, 15, 31))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v16f32, V1, V2, Mask, DAG))
+    return S;
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
@@ -11041,12 +11033,9 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  // X86 has dedicated unpack instructions that can handle specific blend
-  // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v8i64, V1, V2, Mask, DAG))
+    return S;
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
@@ -11063,15 +11052,9 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
-                          0, 16, 1, 17, 4, 20, 5, 21,
-                          8, 24, 9, 25, 12, 28, 13, 29))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
-  if (isShuffleEquivalent(Mask,
-                          2, 18, 3, 19, 6, 22, 7, 23,
-                          10, 26, 11, 27, 14, 30, 15, 31))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+  // Try to use dedicated unpack instructions.
+  if (SDValue S = lowerVectorUnpack(DL, MVT::v16i32, V1, V2, Mask, DAG))
+    return S;
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1074,6 +1074,21 @@
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_40u1:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_40u1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
+  ret <4 x i32> %shuffle
+}
+
 define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: shuffle_v4i32_3456:
 ; SSE2:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -407,10 +407,7 @@
 define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00224466:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   ret <8 x double> %shuffle
@@ -431,10 +428,7 @@
 define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_11335577:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
   ret <8 x double> %shuffle
@@ -1125,10 +1119,7 @@
 define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00224466:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
-; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   ret <8 x i64> %shuffle
@@ -1149,10 +1140,7 @@
 define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_11335577:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
-; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
   ret <8 x i64> %shuffle
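
---

For anyone who wants to experiment with the matching logic outside of SelectionDAG, the following is a minimal standalone sketch of the same algorithm. It is illustrative only: matchUnpack, UnpackKind, and UnpackMatch are hypothetical names invented for this sketch (they are not part of the patch or of LLVM), negative mask elements stand in for undef, and the two checks in main reuse the masks from the shuffle_v4i32_40u1 and shuffle_v8i64_00224466 tests above.

// unpack_sketch.cpp - standalone sketch of lowerVectorUnpack's matcher.
// Build: clang++ -std=c++14 unpack_sketch.cpp
#include <cassert>
#include <cstdio>
#include <initializer_list>
#include <vector>

// Negative mask values denote undef elements, mirroring LLVM shuffle masks.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return Val < 0 || Val == CmpVal;
}

enum class UnpackKind { None, Lo, Hi };

struct UnpackMatch {
  UnpackKind Kind = UnpackKind::None;
  bool XFromV1 = false; // First unpack operand is V1 (else V2).
  bool YFromV1 = false; // Second unpack operand is V1 (else V2).
};

// Match Mask (NumElts elements, split into 128-bit lanes of NumLaneElts
// elements) against the repeated-lane UNPCKL (Offset = 0) and UNPCKH
// (Offset = NumLaneElts / 2) patterns, trying each input in each operand
// position, exactly as lowerVectorUnpack does.
static UnpackMatch matchUnpack(const std::vector<int> &Mask,
                               unsigned NumElts, unsigned NumLaneElts) {
  assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
  for (UnpackKind Kind : {UnpackKind::Lo, UnpackKind::Hi}) {
    unsigned Offset = (Kind == UnpackKind::Lo) ? 0 : NumLaneElts / 2;
    bool MatchV1X = true, MatchV2X = true;
    bool MatchV1Y = true, MatchV2Y = true;
    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
      for (unsigned i = 0, j = l + Offset; i != NumLaneElts; i += 2, ++j) {
        int X = Mask[l + i];     // Even result element: X[j].
        int Y = Mask[l + i + 1]; // Odd result element:  Y[j].
        MatchV1X &= isUndefOrEqual(X, j);
        MatchV1Y &= isUndefOrEqual(Y, j);
        MatchV2X &= isUndefOrEqual(X, j + NumElts);
        MatchV2Y &= isUndefOrEqual(Y, j + NumElts);
      }
    }
    if ((MatchV1X || MatchV2X) && (MatchV1Y || MatchV2Y))
      return {Kind, MatchV1X, MatchV1Y};
  }
  return {};
}

int main() {
  // shuffle_v4i32_40u1: v4i32 mask <4, 0, undef, 1> matches UNPCKL with
  // commuted operands (X from V2, Y from V1), i.e. the single
  // "punpckldq xmm1, xmm0" the new test expects.
  UnpackMatch M = matchUnpack({4, 0, -1, 1}, /*NumElts=*/4, /*NumLaneElts=*/4);
  assert(M.Kind == UnpackKind::Lo && !M.XFromV1 && M.YFromV1);

  // shuffle_v8i64_00224466: v8i64 (512-bit, two elements per 128-bit lane)
  // mask 0,0,2,2,4,4,6,6 matches UNPCKL with V1 in both positions, i.e. the
  // single vpunpcklqdq the updated test expects.
  M = matchUnpack({0, 0, 2, 2, 4, 4, 6, 6}, /*NumElts=*/8, /*NumLaneElts=*/2);
  assert(M.Kind == UnpackKind::Lo && M.XFromV1 && M.YFromV1);

  std::puts("both masks matched as expected");
  return 0;
}

The design point this exercises, visible in the first check, is that the matcher tries each input in both unpack operand positions. Commuted masks such as <4, 0, undef, 1> therefore still lower to a single unpack with the operands swapped, which the fixed per-type isShuffleEquivalent patterns removed by this patch could not catch.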