Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8010,6 +8010,37 @@
   return Zeroable;
 }
 
+// The shuffle result is of the form:
+// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
+// ascending order. Each element of Zeroable corresponds to one element of
+// Mask, as described in computeZeroableShuffleElements.
+//
+// The function looks for a sub-mask whose nonzero elements are in increasing
+// order. If such a sub-mask exists, the function returns true.
+static bool isNonZeroElementsInOrder(SmallBitVector Zeroable,
+                                     ArrayRef<int> Mask, const EVT &VectorType,
+                                     bool &IsZeroSideLeft) {
+  int NextElement = -1;
+  // Check that the Mask's nonzero elements are in increasing order.
+  for (int i = 0, e = Zeroable.size(); i < e; i++) {
+    // An undef element cannot be proven to be zero or part of the sequence.
+    if (Mask[i] == -1)
+      return false;
+    if (Zeroable[i])
+      continue;
+    // Find the lowest nonzero element.
+    if (NextElement == -1) {
+      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+      IsZeroSideLeft = NextElement != 0;
+    }
+    // Exit if the mask's nonzero elements are not in increasing order.
+    if (NextElement != Mask[i])
+      return false;
+    NextElement++;
+  }
+  return true;
+}
+
 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                             ArrayRef<int> Mask, SDValue V1,
@@ -8065,6 +8096,46 @@
                          DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
 }
 
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl);
+
+// convertBitVectorToUnsigned - Convert a SmallBitVector to an unsigned
+// integer. The result is the bitwise NOT of Zeroable: bit i is set iff
+// element i is not zeroable.
+static unsigned convertBitVectorToUnsigned(const SmallBitVector &Zeroable) {
+  unsigned convertBit = 0;
+  for (int i = 0, e = Zeroable.size(); i < e; i++)
+    convertBit |= !(Zeroable[i]) << i;
+  return convertBit;
+}
+
+// X86 has a dedicated shuffle pattern that can be lowered to VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+                                          const SmallBitVector &Zeroable,
+                                          ArrayRef<int> Mask, SDValue &V1,
+                                          SDValue &V2, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget) {
+  bool IsLeftZeroSide = true;
+  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+                                IsLeftZeroSide))
+    return SDValue();
+  unsigned VEXPANDMask = convertBitVectorToUnsigned(Zeroable);
+  MVT IntegerType =
+      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+  unsigned NumElts = VT.getVectorNumElements();
+  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+         "Unexpected number of vector elements");
+  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+                              Subtarget, DAG, DL);
+  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+  return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+                     DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+                     ZeroVector);
+}
+
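+// For example (illustrative sketch): a v8f32 shuffle whose mask is
+// <0,z,1,z,z,z,z,z>, with z a zeroable lane, has its nonzero elements 0 and 1
+// in increasing order, so Zeroable = 0b11111010 and the k-mask becomes
+// ~Zeroable = 0x05; the node is lowered to a zero-masked VEXPANDPS, as
+// exercised by @expand in the new vector-shuffle-avx512.ll test.
+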
 // X86 has dedicated unpack instructions that can handle specific blend
 // operations: UNPCKH and UNPCKL.
 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -12082,6 +12153,11 @@
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
 
   // If we have AVX2 then we always want to lower with a blend because an v4 we
   // can fully permute the elements.
@@ -12147,12 +12223,17 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or VEXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
 
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }
+
   // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                       Mask, Subtarget, DAG))
@@ -12253,6 +12334,11 @@
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
 
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
@@ -12317,12 +12403,17 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or EXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
 
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12640,6 +12731,7 @@
 
 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12682,11 +12774,16 @@
           lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
     return Op;
 
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
+                                             V2, DAG, Subtarget))
+    return V;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
 }
 
 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable,
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
@@ -12718,6 +12815,10 @@
     // Otherwise, fall back to a SHUFPS sequence.
     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2,
                                         DAG);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
 }
@@ -12775,6 +12876,10 @@
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
+                                             V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }
@@ -12839,6 +12944,10 @@
                                     CastV1, CastV2, DAG);
     return DAG.getBitcast(MVT::v16i32, ShufPS);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
 }
@@ -12969,9 +13078,9 @@
   // the requisite ISA extensions for that element type are available.
   switch (VT.SimpleTy) {
   case MVT::v8f64:
-    return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16f32:
-    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16i32:
Index: test/CodeGen/X86/vector-shuffle-avx512.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-avx512.ll
+++ test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=KNL
+
+;Expand 128 -> 256, covering <4 x float> and <2 x double>
+define <8 x float> @expand(<4 x float> %a) {
+; SKX-LABEL: expand:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $5, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
+; KNL-NEXT: retq
+  %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> 
+  ret <8 x float> %res
+}
+
+define <8 x float> @expand1(<4 x float> %a ) {
+; SKX-LABEL: expand1:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $-86, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand1:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = 
+; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT: retq
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> 
+  ret <8 x float> %res
+}
+
+;Expand 128 -> 256, covering <2 x double> -> <4 x double>
+define <4 x double> @expand2(<2 x double> %a) {
+; SKX-LABEL: expand2:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand2:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; KNL-NEXT: retq
+  %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> 
+  ret <4 x double> %res
+}
+
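+; Note (added for clarity): in the SKX checks the immediate loaded into the
+; mask register encodes which destination lanes receive successive source
+; elements. In @expand2 above, $9 = 0b1001 places a[0] in lane 0 and a[1] in
+; lane 3, and {z} zeroes the remaining lanes.
+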
+;Expand 128 -> 256, covering <4 x i32> -> <8 x i32>
+define <8 x i32> @expand3(<4 x i32> %a ) {
+; SKX-LABEL: expand3:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand3:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT: vpbroadcastq %xmm0, %ymm0
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
+; KNL-NEXT: retq
+  %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> 
+  ret <8 x i32> %res
+}
+
+;Expand 128 -> 256, covering <2 x i64> -> <4 x i64>
+define <4 x i64> @expand4(<2 x i64> %a ) {
+; SKX-LABEL: expand4:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand4:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; KNL-NEXT: retq
+  %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> 
+  ret <4 x i64> %res
+}
+
+;Negative test for 128 -> 256
+define <8 x float> @expand5(<4 x float> %a ) {
+; SKX-LABEL: expand5:
+; SKX: # BB#0:
+; SKX-NEXT: vbroadcastss %xmm0, %ymm0
+; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand5:
+; KNL: # BB#0:
+; KNL-NEXT: vbroadcastss %xmm0, %ymm0
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT: retq
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> 
+  ret <8 x float> %res
+}
+
+;Expand 256 -> 512, covering <8 x float> -> <16 x float>
+define <8 x float> @expand6(<4 x float> %a ) {
+; SKX-LABEL: expand6:
+; SKX: # BB#0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vinsertf32x4 $1, %xmm0, %ymm1, %ymm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand6:
+; KNL: # BB#0:
+; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> 
+  ret <8 x float> %res
+}
+
+define <16 x float> @expand7(<8 x float> %a) {
+; SKX-LABEL: expand7:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movw $1285, %ax # imm = 0x505
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand7:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT: movw $1285, %ax # imm = 0x505
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+  %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> 
+  ret <16 x float> %res
+}
+
+define <16 x float> @expand8(<8 x float> %a ) {
+; SKX-LABEL: expand8:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand8:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> 
+  ret <16 x float> %res
+}
+
+;Expand 256 -> 512, covering <4 x double> -> <8 x double>
+define <8 x double> @expand9(<4 x double> %a) {
+; SKX-LABEL: expand9:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand9:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT: movb $-127, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+  %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> 
+  ret <8 x double> %res
+}
+
+define <16 x i32> @expand10(<8 x i32> %a ) {
+; SKX-LABEL: expand10:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand10:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+  %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> 
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @expand11(<4 x i64> %a) {
+; SKX-LABEL: expand11:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand11:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT: movb $-127, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+  %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> 
+  ret <8 x i64> %res
+}
+
+;Negative test for 256 -> 512
+define <16 x float> @expand12(<8 x float> %a) {
+; SKX-LABEL: expand12:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; SKX-NEXT: vxorps %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand12:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> 
+  ret <16 x float> %res
+}
+
+define <16 x float> @expand13(<8 x float> %a ) {
+; SKX-LABEL: expand13:
+; SKX: # BB#0:
+; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand13:
+; KNL: # BB#0:
+; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: retq
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> 
+  ret <16 x float> %res
+}
+
+; This checks the case where the first operand is a vector of mixed values and
+; the shuffle mask selects only the known-zero elements from it.
+
+define <8 x float> @expand14(<4 x float> %a) {
+; SKX-LABEL: expand14:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $20, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand14:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
+; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT: retq
+  %addV = fadd <4 x float> , 
+  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> 
+  ret <8 x float> %res
+}
+
+;Negative test.
+define <8 x float> @expand15(<4 x float> %a) {
+; SKX-LABEL: expand15:
+; SKX: # BB#0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
+; SKX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
+; SKX-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand15:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
+; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT: retq
+  %addV = fadd <4 x float> , 
+  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> 
+  ret <8 x float> %res
+}