diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16035,6 +16035,140 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
 }
 
+static SDValue lowerShuffleAsBlendWithBroadcast(
+    const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, MVT VT,
+    SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(Mask.size() == NumElts && "Mask size mismatch?");
+
+  // The scalar element type must be legal for us to succeed.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT.getScalarType()))
+    return SDValue();
+
+  // Don't get stuck reprocessing the broadcast.
+  if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
+    return SDValue();
+
+  // Can we theoretically broadcast this element type?
+  // Note that for AVX1 we must be able to fold the load!
+  if (!((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
+        (Subtarget.hasAVX() && EltSizeInBits >= 32)))
+    return SDValue();
+
+  enum class EltStatus { Undef, Blendable, BroadcastCandidate };
+  SmallVector<EltStatus, 32> DecipheredMask;
+  DecipheredMask.reserve(Mask.size());
+  transform(enumerate(Mask), std::back_inserter(DecipheredMask),
+            [NumElts](auto I) {
+              unsigned EltIdx = I.index();
+              int MaskElt = I.value();
+
+              assert(isUndefOrInRange(MaskElt, 0, 2 * NumElts) &&
+                     "Malformed shuffle?");
+
+              if (MaskElt < 0)
+                return EltStatus::Undef;
+              if ((unsigned)MaskElt == EltIdx ||
+                  (unsigned)MaskElt == NumElts + EltIdx)
+                return EltStatus::Blendable;
+              // FIXME: what if inputs are broadcasts?
+              return EltStatus::BroadcastCandidate;
+            });
+  assert(DecipheredMask.size() == Mask.size() && "Mask size shouldn't change.");
+
+  // If this already looks like a blend, then don't bother any further.
+  if (!is_contained(DecipheredMask, EltStatus::BroadcastCandidate))
+    return SDValue();
+
+  // Are all non-blendable elements picking the same scalar value?
+  Optional<unsigned> CandidateMaskElt;
+  for (auto I : zip(DecipheredMask, Mask)) {
+    EltStatus Dsc = std::get<0>(I);
+    int MaskElt = std::get<1>(I);
+
+    if (Dsc != EltStatus::BroadcastCandidate)
+      continue;
+
+    // FIXME: what if inputs are broadcasts?
+    if (!CandidateMaskElt)
+      CandidateMaskElt = MaskElt;
+    else if (*CandidateMaskElt != (unsigned)MaskElt)
+      return SDValue();
+  }
+  assert(CandidateMaskElt &&
+         "Should have found a single non-blendable mask element!");
+
+  int ToBeBroadcastedInputIdx = *CandidateMaskElt / NumElts;
+  int ToBeBroadcastedInputEltIdx = *CandidateMaskElt % NumElts;
+
+  // We can only freely broadcast the 0'th element, so let's abort otherwise.
+  if (ToBeBroadcastedInputEltIdx != 0)
+    return SDValue();
+
+  // Strike out all CandidateMaskElt mask elements.
+  SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
+  for (auto I : zip(DecipheredMask, AdjustedMask)) {
+    EltStatus &Dsc = std::get<0>(I);
+    int &MaskElt = std::get<1>(I);
+
+    if ((unsigned)MaskElt == *CandidateMaskElt) {
+      Dsc = EltStatus::BroadcastCandidate;
+      MaskElt = -1;
+    }
+  }
+
+  // The remaining mask needs to be a single-source mask.
+  if (!ShuffleVectorInst::isSingleSourceMask(AdjustedMask))
+    return SDValue();
+
+  assert(is_contained(DecipheredMask, EltStatus::Blendable) &&
+         "Whole shuffle simplified into a broadcast?");
+
+  assert(ShuffleVectorInst::isIdentityMask(AdjustedMask) &&
+         "By now the remaining mask can only be an identity mask.");
+
+  std::array<SDValue, 2> Inputs = {V1, V2};
+
+  // From which input does the to-be-broadcasted element come?
+  SDValue ToBeBroadcastedInput = Inputs[ToBeBroadcastedInputIdx];
+
+  // Also, if we don't have AVX2 broadcast-from-reg,
+  // we must be able to fold the load.
+  if (!Subtarget.hasAVX2() && !MayFoldLoad(ToBeBroadcastedInput))
+    return SDValue();
+
+  // Which input do we keep as the baseline identity?
+  int IdentityInputIdx = *find_if(AdjustedMask, [](int MaskElt) {
+    return MaskElt != -1;
+  }) / NumElts;
+  SDValue IdentityInput = Inputs[IdentityInputIdx];
+
+  // Okay, this can be represented as a blend with a broadcast!
+  SDValue ScalarElt = DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, DL, VT.getScalarType(), ToBeBroadcastedInput,
+      DAG.getIntPtrConstant(ToBeBroadcastedInputEltIdx, DL));
+  SDValue InputSplatVec = DAG.getSplatBuildVector(VT, DL, ScalarElt);
+  SmallVector<int, 32> BlendMask;
+  BlendMask.reserve(NumElts);
+  transform(enumerate(DecipheredMask), std::back_inserter(BlendMask),
+            [NumElts](auto I) -> int {
+              int EltIdx = I.index();
+              EltStatus Dsc = I.value();
+
+              switch (Dsc) {
+              case EltStatus::Undef:
+                return -1;
+              case EltStatus::BroadcastCandidate:
+                return EltIdx + NumElts;
+              default:
+                return EltIdx;
+              }
+            });
+  assert(BlendMask.size() == Mask.size() && "Mask size shouldn't change.");
+  return DAG.getVectorShuffle(VT, DL, IdentityInput, InputSplatVec, BlendMask);
+}
+
 /// Either split a vector in halves or decompose the shuffles and the
 /// blend/unpack.
 ///
@@ -17128,6 +17262,8 @@
                                              DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return V;
 
+  // FIXME: we should run `lowerShuffleAsBlendWithBroadcast()` here.
+
   // Try to permute the lanes and then use a per-lane permute.
   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
                                                       Mask, DAG, Subtarget))
@@ -17151,6 +17287,12 @@
                                                Zeroable, Subtarget, DAG))
     return Op;
 
+  // See if this shuffle can be represented as a broadcast of the 0'th element
+  // of some input, and a blend between said broadcast and an input.
+  if (SDValue Blend = lowerShuffleAsBlendWithBroadcast(
+          DL, Mask, Zeroable, MVT::v4f64, V1, V2, Subtarget, DAG))
+    return Blend;
+
   // If we have lane crossing shuffles AND they don't all come from the lower
   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
   // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
@@ -17273,6 +17415,12 @@
   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
     return V;
 
+  // See if this shuffle can be represented as a broadcast of the 0'th element
+  // of some input, and a blend between said broadcast and an input.
+  if (SDValue Blend = lowerShuffleAsBlendWithBroadcast(
+          DL, Mask, Zeroable, MVT::v4i64, V1, V2, Subtarget, DAG))
+    return Blend;
+
   // If we have one input in place, then we can permute the other input and
   // blend the result.
   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -151,8 +151,8 @@
 define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
+; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm1
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-NEXT:    retq
   %r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x double> %r
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -268,11 +268,11 @@
 ; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-SLOW-NEXT:    vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -289,11 +289,11 @@
 ; AVX2-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    retq
   %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> 
   %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> 
@@ -463,8 +463,7 @@
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm1
-; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm2
-; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -1677,18 +1677,44 @@
 }
 
 define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
-; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vbroadcastsd (%eax), %ymm1
-; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; X86-NEXT:    retl
+; X86-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X86-AVX1-NEXT:    retl
 ;
-; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
-; X64:       # %bb.0:
-; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
-; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; X64-NEXT:    retq
+; X86-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X86-AVX2-NEXT:    retl
+;
+; X86-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X64-AVX512-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
   %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> 
   %res = select <8 x i1> , <8 x float> %shuf, <8 x float> %default
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -468,11 +468,11 @@
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X86-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X86-AVX2-NEXT:    vmovapd %ymm3, (%edx)
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
@@ -495,8 +495,8 @@
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X86-AVX512-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3]
 ; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
 ; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0]
@@ -538,11 +538,11 @@
 ;
 ; X64-AVX2-LABEL: PR48908:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X64-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X64-AVX2-NEXT:    vmovapd %ymm3, (%rdi)
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
@@ -562,8 +562,8 @@
 ; X64-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; X64-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X64-AVX512-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3]
 ; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
 ; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,3,8,1]
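
Illustrative example (not part of the patch): the shuffles this new lowering targets are those where all lanes but one are an identity pick of one operand, and the remaining lane takes element 0 of the other operand. A minimal LLVM IR sketch, assuming AVX2 and a hypothetical function name @blend_bcast_v4f64, mirroring the updated test in copy-low-subvec-elt-to-high-subvec-elt.ll:

  define <4 x double> @blend_bcast_v4f64(<4 x double> %x, <4 x double> %y) {
    ; Lanes 0-2 keep %x (identity picks); lane 3 takes element 0 of %y
    ; (mask index 4), i.e. a blend of %x with a splat of %y[0].
    %r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    ret <4 x double> %r
  }

With lowerShuffleAsBlendWithBroadcast() this should lower to roughly a vbroadcastsd of the low element of %y followed by a single vblendps/vblendpd, instead of the previous vinsertf128 + vshufpd sequence, matching the CHECK-line changes above.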