Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -8090,6 +8090,37 @@
   return Zeroable;
 }
 
+// A VEXPAND-style shuffle result has the form
+//   0* a[0] 0* a[1] ... 0* a[n], n >= 0, with the a[i] in ascending order.
+// Each Zeroable element corresponds to a particular Mask element, as
+// described in computeZeroableShuffleElements.
+//
+// The function looks for a sub-mask whose non-zero elements are in
+// increasing order and returns true if such a sub-mask exists.
+static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
+                                     ArrayRef<int> Mask, const EVT &VectorType,
+                                     bool &IsZeroSideLeft) {
+  int NextElement = -1;
+  // Check that the Mask's non-zero elements are in increasing order.
+  for (int i = 0, e = Zeroable.size(); i < e; i++) {
+    // Bail out if any mask element is undef.
+    if (Mask[i] == -1)
+      return false;
+    if (Zeroable[i])
+      continue;
+    // The first non-zero element decides which source supplies the run.
+    if (NextElement == -1) {
+      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+      IsZeroSideLeft = NextElement != 0;
+    }
+    // Fail if the non-zero elements are not consecutive.
+    if (NextElement != Mask[i])
+      return false;
+    NextElement++;
+  }
+  return true;
+}
+
 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                             ArrayRef<int> Mask, SDValue V1,
@@ -8145,6 +8176,46 @@
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
 }
 
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl);
+
+// Convert a SmallBitVector to an unsigned integer. The result is
+// NOT(Zeroable): bit i is set exactly when lane i is not zeroable.
+static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
+  unsigned convertBit = 0;
+  for (int i = 0, e = Zeroable.size(); i < e; i++)
+    convertBit |= !(Zeroable[i]) << i;
+  return convertBit;
+}
+
+// X86 has a dedicated instruction for shuffles of this form; lower them to
+// VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+                                          const SmallBitVector &Zeroable,
+                                          ArrayRef<int> Mask, SDValue &V1,
+                                          SDValue &V2, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget) {
+  bool IsLeftZeroSide = true;
+  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+                                IsLeftZeroSide))
+    return SDValue();
+  unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+  MVT IntegerType =
+      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+  unsigned NumElts = VT.getVectorNumElements();
+  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+         "Unexpected number of vector elements");
+  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+                              Subtarget, DAG, DL);
+  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+  return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+                     DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+                     ZeroVector);
+}
+
 // X86 has dedicated unpack instructions that can handle specific blend
 // operations: UNPCKH and UNPCKL.
 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
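To make the new helpers easier to follow, here is a small standalone sketch (editorial, not part of the patch): it models isNonZeroElementsInOrder and the write-mask conversion with plain std::vector instead of ArrayRef/SmallBitVector, uses -1 for undef mask elements, and the example masks are illustrative guesses rather than the exact constants produced by the tests below.

// Standalone model of the VEXPAND pre-checks above (assumptions noted in the
// surrounding text; helper names are made up for this sketch).
#include <cstdio>
#include <vector>

// Mirrors isNonZeroElementsInOrder: the non-zero lanes must read consecutive
// source elements, so the result looks like 0* a[0] 0* a[1] ... 0* a[n].
static bool nonZeroElementsInOrder(const std::vector<bool> &Zeroable,
                                   const std::vector<int> &Mask,
                                   int NumSrcElts, bool &IsZeroSideLeft) {
  int NextElement = -1;
  for (int i = 0, e = (int)Zeroable.size(); i < e; ++i) {
    if (Mask[i] == -1)   // undef lane: give up
      return false;
    if (Zeroable[i])     // lane is known zero: skip it
      continue;
    if (NextElement == -1) {
      // The first non-zero lane picks the source and the starting element.
      NextElement = Mask[i] != 0 ? NumSrcElts : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    if (NextElement != Mask[i]) // must be the next consecutive element
      return false;
    ++NextElement;
  }
  return true;
}

// Mirrors convertBitVectorToUnsiged: bit i of the VEXPAND write-mask is set
// exactly when lane i is not zeroable.
static unsigned writeMask(const std::vector<bool> &Zeroable) {
  unsigned Bits = 0;
  for (int i = 0, e = (int)Zeroable.size(); i < e; ++i)
    Bits |= (unsigned)!Zeroable[i] << i;
  return Bits;
}

int main() {
  bool ZeroLeft = false;
  // In-order case (cf. @expand below): lanes 0 and 2 read a[0] and a[1].
  std::vector<int>  M1 = {0, 8, 1, 8, 8, 8, 8, 8};
  std::vector<bool> Z1 = {false, true, false, true, true, true, true, true};
  std::printf("in order: %d, mask = %u\n",
              nonZeroElementsInOrder(Z1, M1, 8, ZeroLeft), writeMask(Z1)); // 1, 5
  // Out-of-order case (cf. the negative tests): a[1] appears before a[0].
  std::vector<int> M2 = {1, 8, 0, 8, 8, 8, 8, 8};
  std::printf("in order: %d\n", nonZeroElementsInOrder(Z1, M2, 8, ZeroLeft)); // 0
  return 0;
}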
@@ -12159,6 +12230,11 @@
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
 
   // If we have AVX2 then we always want to lower with a blend because an v4 we
   // can fully permute the elements.
@@ -12222,12 +12298,17 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or VEXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
 
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }
+
   // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                       Mask, Subtarget, DAG))
@@ -12328,6 +12409,11 @@
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
 
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
@@ -12392,12 +12478,17 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or VEXPAND.
+  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                     Mask, Subtarget, DAG))
      return Rotate;
 
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12754,6 +12845,7 @@
 
 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12796,11 +12888,16 @@
           lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
     return Op;
 
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
+                                             V2, DAG, Subtarget))
+    return V;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
 }
 
 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable,
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
@@ -12832,6 +12929,10 @@
     // Otherwise, fall back to a SHUFPS sequence.
     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
 }
@@ -12889,6 +12990,10 @@
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
+                                             V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }
 
@@ -12953,6 +13058,10 @@
                                     CastV1, CastV2, DAG);
     return DAG.getBitcast(MVT::v16i32, ShufPS);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
 }
@@ -13089,9 +13198,9 @@
   // the requisite ISA extensions for that element type are available.
   switch (VT.SimpleTy) {
   case MVT::v8f64:
-    return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16f32:
-    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16i32:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=knl | FileCheck %s --check-prefix=KNL
+
+;expand 128 -> 256 include <4 x float> <2 x double>
+define <8 x float> @expand(<4 x float> %a) {
+; SKX-LABEL: expand:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT:    movb $5, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
+; KNL-NEXT:    retq
+  %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32>
+  ret <8 x float> %res
+}
+
+define <8 x float> @expand1(<4 x float> %a ) {
+; SKX-LABEL: expand1:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT:    movb $-86, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand1:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT:    vmovaps {{.*#+}} ymm1 =
+; KNL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT:    retq
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
+
+;Expand 128 -> 256 test <2 x double> -> <4 x double>
+define <4 x double> @expand2(<2 x double> %a) {
+; SKX-LABEL: expand2:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT:    movb $9, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandpd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand2:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; KNL-NEXT:    retq
+  %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32>
+  ret <4 x double> %res
+}
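A note on reading the SKX check lines in these tests: the write-mask immediate has bit i set when result lane i is taken from the source, masks for 8 or fewer lanes are materialized as a byte (matching the std::max(NumElts, 8) in the lowering code), and llc prints the byte/word immediates as signed values. The sketch below is editorial, not part of the patch; printMask is a made-up helper and the lane sets are decoded from the immediates shown in the checks.

// Reproduce the write-mask immediates that appear in the SKX checks.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

static void printMask(std::initializer_list<int> NonZeroLanes, int NumLanes) {
  unsigned Bits = 0;
  for (int Lane : NonZeroLanes)
    Bits |= 1u << Lane;           // bit set => lane comes from the source
  if (NumLanes <= 8)              // byte-sized mask, printed as a signed int8
    std::printf("movb $%d, %%al   # 0x%02X\n", (int)(int8_t)Bits, Bits);
  else                            // word-sized mask, printed as a signed int16
    std::printf("movw $%d, %%ax   # 0x%04X\n", (int)(int16_t)Bits, Bits);
}

int main() {
  printMask({0, 2}, 8);                   // movb $5      (@expand)
  printMask({0, 3}, 4);                   // movb $9      (@expand2, @expand4)
  printMask({1, 3, 5, 7}, 8);             // movb $-86    (@expand1)
  printMask({0, 7}, 8);                   // movb $-127   (@expand3, @expand9, @expand11)
  printMask({0, 2, 8, 10}, 16);           // movw $1285   (@expand7)
  printMask({1, 3, 5, 7, 9, 11, 13, 15}, 16); // movw $-21846 (@expand8, @expand10)
  return 0;
}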
+
+;expand 128 -> 256 include case <4 x i32> <8 x i32>
+define <8 x i32> @expand3(<4 x i32> %a ) {
+; SKX-LABEL: expand3:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT:    movb $-127, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand3:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
+; KNL-NEXT:    retq
+  %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32>
+  ret <8 x i32> %res
+}
+
+;expand 128 -> 256 include case <2 x i64> <4 x i64>
+define <4 x i64> @expand4(<2 x i64> %a ) {
+; SKX-LABEL: expand4:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT:    movb $9, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand4:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; KNL-NEXT:    retq
+  %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32>
+  ret <4 x i64> %res
+}
+
+;Negative test for 128-> 256
+define <8 x float> @expand5(<4 x float> %a ) {
+; SKX-LABEL: expand5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vbroadcastss %xmm0, %ymm0
+; SKX-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand5:
+; KNL:       # BB#0:
+; KNL-NEXT:    vbroadcastss %xmm0, %ymm0
+; KNL-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT:    retq
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
+
+;expand 256 -> 512 include <8 x float> <16 x float>
+define <8 x float> @expand6(<4 x float> %a ) {
+; SKX-LABEL: expand6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vinsertf{{.*}}$1, %xmm0, %ymm1, %ymm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand6:
+; KNL:       # BB#0:
+; KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; KNL-NEXT:    retq
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
+
+define <16 x float> @expand7(<8 x float> %a) {
+; SKX-LABEL: expand7:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT:    movw $1285, %ax # imm = 0x505
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand7:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    movw $1285, %ax # imm = 0x505
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+  %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32>
+  ret <16 x float> %res
+}
+
+define <16 x float> @expand8(<8 x float> %a ) {
+; SKX-LABEL: expand8:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand8:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32>
+  ret <16 x float> %res
+}
+
+;expand 256 -> 512 include <4 x double> <8 x double>
+define <8 x double> @expand9(<4 x double> %a) {
+; SKX-LABEL: expand9:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT:    movb $-127, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand9:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    movb $-127, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+  %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32>
+  ret <8 x double> %res
+}
+
+define <16 x i32> @expand10(<8 x i32> %a ) {
+; SKX-LABEL: expand10:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand10:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+  %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @expand11(<4 x i64> %a) {
+; SKX-LABEL: expand11:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT:    movb $-127, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand11:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    movb $-127, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+  %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32>
+  ret <8 x i64> %res
+}
+
+;Negative test for 256-> 512
+define <16 x float> @expand12(<8 x float> %a) {
+; SKX-LABEL: expand12:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; SKX-NEXT:    vxorps %zmm1, %zmm1, %zmm1
+; SKX-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand12:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; KNL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
+; KNL-NEXT:    vmovaps %zmm1, %zmm0
+; KNL-NEXT:    retq
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32>
+  ret <16 x float> %res
+}
+
+define <16 x float> @expand13(<8 x float> %a ) {
+; SKX-LABEL: expand13:
+; SKX:       # BB#0:
+; SKX-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand13:
+; KNL:       # BB#0:
+; KNL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT:    retq
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32>
+  ret <16 x float> %res
+}
+
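expand14 and expand15 below cover the case where the "zero" operand is not a literal zeroinitializer but an fadd of constants, so a lane is only zeroable when the mask reads an element that folds to zero. Here is a rough standalone model of that idea (editorial, not part of the patch; computeZeroable is a made-up helper and the constant values are hypothetical, since the actual fadd operands are elided in this excerpt).

// Derive Zeroable from a mask when one operand is a vector of mixed constants.
#include <cstdio>
#include <vector>

// Lane i is zeroable when the mask reads a source element that is known to be
// zero (here: a zero constant in the folded %addV operand).
static std::vector<bool> computeZeroable(const std::vector<int> &Mask,
                                         const std::vector<float> &AddV,
                                         int NumSrcElts) {
  std::vector<bool> Zeroable(Mask.size(), false);
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i] < NumSrcElts)                 // lane reads the constant operand
      Zeroable[i] = AddV[Mask[i]] == 0.0f;
  return Zeroable;
}

int main() {
  // Hypothetical folded constants: zeros mixed with non-zero values.
  std::vector<float> AddV = {0.0f, 0.0f, 1.5f, 2.5f};
  // Mask reading zero constants for the lanes that should be cleared.
  std::vector<int> Mask = {0, 1, 4, 0, 5, 1, 1, 1};
  std::vector<bool> Z = computeZeroable(Mask, AddV, 4);
  for (size_t i = 0; i < Z.size(); ++i)
    std::printf("lane %zu zeroable: %d\n", i, (int)Z[i]);
  // With these hypothetical values, lanes 2 and 4 stay non-zero, i.e. a
  // write-mask of 0b00010100 = 20 (cf. "movb $20" in @expand14 below).
  return 0;
}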
+; expand14 covers the case where the first operand is a vector of mixed
+; values (not zeroinitializer) and the mask picks only its known-zero
+; elements for the zeroed lanes.
+
+define <8 x float> @expand14(<4 x float> %a) {
+; SKX-LABEL: expand14:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT:    movb $20, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand14:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
+; KNL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT:    retq
+  %addV = fadd <4 x float> ,
+  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
+
+; Negative test.
+define <8 x float> @expand15(<4 x float> %a) {
+; SKX-LABEL: expand15:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
+; SKX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
+; SKX-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand15:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
+; KNL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT:    retq
+  %addV = fadd <4 x float> ,
+  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}