Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8010,6 +8010,33 @@
   return Zeroable;
 }
 
+// The shuffle result has the following form:
+// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[i] are source elements in
+// ascending order and 0* denotes a run of zero elements.
+// Each element of Zeroable corresponds to a particular element of Mask, as
+// described in computeZeroableShuffleElements.
+// The function checks whether the nonzero elements of Mask appear in
+// increasing order; if such a sub-mask exists, it returns true.
+static bool isNonZeroElementsInOrder(SmallBitVector Zeroable,
+                                     ArrayRef<int> Mask, const EVT &VectorType,
+                                     bool &IsZeroSideLeft) {
+  int MaskElement = -1;
+  // Check if the Mask's nonzero elements are in increasing order.
+  for (int i = 0; i < Zeroable.size(); i++) {
+    if (!Zeroable[i]) {
+      // Find the lowest nonzero element.
+      if (MaskElement == -1) {
+        MaskElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+        IsZeroSideLeft = MaskElement != 0;
+      }
+      if (MaskElement != Mask[i])
+        return false;
+      MaskElement++;
+    }
+  }
+  return true;
+}
+
 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                             ArrayRef<int> Mask, SDValue V1,
@@ -8065,6 +8092,48 @@
                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
 }
 
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl);
+
+// convertBitVectorToUnsigned - Converts the given SmallBitVector to an
+// unsigned integer. The output is NOT(Zeroable): bit i is set iff
+// element i is not zeroable.
+static unsigned convertBitVectorToUnsigned(const SmallBitVector &Zeroable) {
+  unsigned convertBit = 0;
+  for (int i = 0; i < Zeroable.size(); i++)
+    convertBit |= !(Zeroable[i]) << i;
+  return convertBit;
+}
+
+// X86 has a dedicated shuffle pattern that can be lowered to VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+                                          const SmallBitVector &Zeroable,
+                                          ArrayRef<int> Mask, SDValue &V1,
+                                          SDValue &V2, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget) {
+  bool IsLeftZeroSide = true;
+  if (isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+                               IsLeftZeroSide)) {
+    unsigned VEXPANDMask = convertBitVectorToUnsigned(Zeroable);
+    MVT IntegerType =
+        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+    SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+    unsigned numberOfElements = VT.getVectorNumElements();
+    assert((numberOfElements == 4 || numberOfElements == 8 ||
+            numberOfElements == 16) &&
+           "Unexpected number of vector elements");
+    SDValue VMask =
+        getMaskNode(MaskNode, VT.getVectorVT(MVT::i1, numberOfElements),
+                    Subtarget, DAG, DL);
+    SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+    SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+    return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+                       ZeroVector);
+  }
+  return SDValue();
+}
+
 // X86 has dedicated unpack instructions that can handle specific blend
 // operations: UNPCKH and UNPCKL.
 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -12082,6 +12151,11 @@
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
 
   // If we have AVX2 then we always want to lower with a blend because an v4 we
   // can fully permute the elements.
@@ -12147,12 +12221,17 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or VEXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
 
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }
+
   // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                       Mask, Subtarget, DAG))
@@ -12253,6 +12332,11 @@
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
 
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
@@ -12317,12 +12401,17 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or VEXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
 
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12640,7 +12729,8 @@
 
 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       SDValue V1, SDValue V2,
+                                       SmallBitVector Zeroable, SDValue V1,
+                                       SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
@@ -12682,12 +12772,17 @@
       lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
     return Op;
 
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
+                                             V2, DAG, Subtarget))
+    return V;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
 }
 
 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
-                                        SDValue V1, SDValue V2,
+                                        SmallBitVector Zeroable, SDValue V1,
+                                        SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
@@ -12718,6 +12813,10 @@
     // Otherwise, fall back to a SHUFPS sequence.
     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
 }
@@ -12775,6 +12874,10 @@
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
+                                             V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }
@@ -12839,6 +12942,10 @@
                                  CastV1, CastV2, DAG);
     return DAG.getBitcast(MVT::v16i32, ShufPS);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
 }
@@ -12969,9 +13076,9 @@
   // the requisite ISA extensions for that element type are available.
   switch (VT.SimpleTy) {
   case MVT::v8f64:
-    return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16f32:
-    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16i32:
Index: test/CodeGen/X86/shuffle-to-expand.ll
===================================================================
--- test/CodeGen/X86/shuffle-to-expand.ll
+++ test/CodeGen/X86/shuffle-to-expand.ll
@@ -0,0 +1,239 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=KNL
+
+
+; Expand 128 -> 256: <4 x float> and <2 x double> cases.
+define <8 x float> @expand(<4 x float> %a) {
+; SKX-LABEL: expand:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $5, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand:
+; KNL-NOT: vexpandps %ymm0, %ymm0 {%k1} {z}
+  %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32>
+  ret <8 x float> %res
+}
+
+define <8 x float> @expand1(<4 x float> %a ) {
+; SKX-LABEL: expand1:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $-86, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand1:
+; KNL-NOT: vexpandps
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
+
+; Expand 128 -> 256 test: <2 x double> -> <4 x double>.
+define <4 x double> @expand2(<2 x double> %a) {
+; SKX-LABEL: expand2:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand2:
+; KNL-NOT: vexpandpd
+
+  %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32>
+  ret <4 x double> %res
+}
+
+; Expand 128 -> 256: <4 x i32> -> <8 x i32> case.
+define <8 x i32> @expand3(<4 x i32> %a ) {
+; SKX-LABEL: expand3:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand3:
+; KNL-NOT: vpexpandd
+  %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32>
+  ret <8 x i32> %res
+}
+
+; Expand 128 -> 256: <2 x i64> -> <4 x i64> case.
+define <4 x i64> @expand4(<2 x i64> %a ) {
+; SKX-LABEL: expand4:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand4:
+; KNL-NOT: vpexpandq
+  %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32>
+  ret <4 x i64> %res
+}
+
+; Negative test for 128 -> 256.
+define <8 x float> @expand5(<4 x float> %a ) {
+; SKX-LABEL: expand5:
+; SKX: # BB#0:
+; SKX-NOT: vpexpan{.*}
+;
+; KNL-LABEL: expand5:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
+
+; Expand 256 -> 512: <8 x float> -> <16 x float> cases.
+define <8 x float> @expand6(<4 x float> %a ) {
+; SKX-LABEL: expand6:
+; SKX-NOT: vpexpan{.*}
+;
+; KNL-LABEL: expand6:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
+
+define <16 x float> @expand7(<8 x float> %a) {
+; SKX-LABEL: expand7:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movw $1285, %ax # imm = 0x505
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand7:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32>
+  ret <16 x float> %res
+}
+
+define <16 x float> @expand8(<8 x float> %a ) {
+; SKX-LABEL: expand8:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand8:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32>
+  ret <16 x float> %res
+}
+
+; Expand 256 -> 512: <4 x double> -> <8 x double> case.
+define <8 x double> @expand9(<4 x double> %a) {
+; SKX-LABEL: expand9:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand9:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32>
+  ret <8 x double> %res
+}
+
+define <16 x i32> @expand10(<8 x i32> %a ) {
+; SKX-LABEL: expand10:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand10:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @expand11(<4 x i64> %a) {
+; SKX-LABEL: expand11:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand11:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32>
+  ret <8 x i64> %res
+}
+
+; Negative test for 256 -> 512.
+define <16 x float> @expand12(<8 x float> %a ) {
+; SKX-LABEL: expand12:
+; SKX: # BB#0:
+; SKX-NOT: vpexpan{.*}
+;
+; KNL-LABEL: expand12:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32>
+  ret <16 x float> %res
+}
+
+define <16 x float> @expand13(<8 x float> %a ) {
+; SKX-LABEL: expand13:
+; SKX-NOT: vpexpan{.*}
+;
+; KNL-LABEL: expand13:
+; KNL-NOT: vpexpan{.*}
+  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32>
+  ret <16 x float> %res
+}
+
+; This test covers the case where the source is a vector of mixed values and
+; the mask selects the zero elements from that vector.
+
+define <8 x float> @expand14(<4 x float> %a) {
+; SKX-LABEL: expand14:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT: vmovdqa64 {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; SKX-NEXT: movb $20, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm1 {%k1}
+; SKX-NEXT: vmovdqa64 %ymm1, %ymm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand14:
+; KNL-NOT: vexpandps
+  %addV = fadd <4 x float> , 
+  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
+
+; Negative test.
+define <8 x float> @expand15(<4 x float> %a) {
+; SKX-LABEL: expand15:
+; SKX: # BB#0:
+; SKX-NOT: vexpandps
+;
+; KNL-LABEL: expand15:
+; KNL: # BB#0:
+; KNL-NOT: vexpandps
+  %addV = fadd <4 x float> , 
+  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32>
+  ret <8 x float> %res
+}
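
For reference, below is a small standalone C++ sketch that is not part of the patch. It mirrors the idea behind isNonZeroElementsInOrder and convertBitVectorToUnsigned on plain containers; the helper names nonZeroElementsInOrder and zeroableToWriteMask are hypothetical stand-ins, and zero lanes are simply marked with -1. It shows how a zeroable shuffle pattern is checked for the expand shape and turned into the VEXPAND write-mask immediate: in the first test above, lanes 0 and 2 of the v8f32 result take a[0] and a[1] in order and every other lane is zero, so the immediate is 0b00000101 = 5, matching the "movb $5" in the SKX checks.

// Standalone sketch, not LLVM code: models the expand-shape check and the
// zeroable-to-immediate conversion used by the lowering above.
#include <cstdio>
#include <vector>

// The nonzero lanes must pick consecutive source elements, starting at
// element 0 of either the first or the second shuffle operand.
static bool nonZeroElementsInOrder(const std::vector<bool> &Zeroable,
                                   const std::vector<int> &Mask,
                                   int SrcNumElts, bool &IsZeroSideLeft) {
  int Expected = -1;
  for (std::size_t i = 0; i < Zeroable.size(); ++i) {
    if (Zeroable[i])
      continue;
    if (Expected == -1) {
      // The first nonzero lane decides which operand is being expanded.
      Expected = Mask[i] != 0 ? SrcNumElts : 0;
      IsZeroSideLeft = Expected != 0;
    }
    if (Mask[i] != Expected)
      return false;
    ++Expected;
  }
  return true;
}

// NOT(Zeroable) as an immediate: bit i is set iff result lane i is nonzero.
static unsigned zeroableToWriteMask(const std::vector<bool> &Zeroable) {
  unsigned M = 0;
  for (std::size_t i = 0; i < Zeroable.size(); ++i)
    M |= unsigned(!Zeroable[i]) << i;
  return M;
}

int main() {
  // Mirrors the first test: a v8f32 result where lanes 0 and 2 take a[0] and
  // a[1] and every other lane is zero, giving write mask 0b00000101 = 5.
  std::vector<int> Mask = {0, -1, 1, -1, -1, -1, -1, -1}; // -1 marks a zero lane
  std::vector<bool> Zeroable(Mask.size());
  for (std::size_t i = 0; i < Mask.size(); ++i)
    Zeroable[i] = Mask[i] < 0;

  bool IsZeroSideLeft = true;
  if (nonZeroElementsInOrder(Zeroable, Mask, 4, IsZeroSideLeft))
    std::printf("VEXPAND candidate: write mask = %u, zero side = %s\n",
                zeroableToWriteMask(Zeroable),
                IsZeroSideLeft ? "left" : "right");
  return 0;
}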