Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -13427,6 +13427,60 @@
 }
 
 /// Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a lane permutation followed by a per-lane permutation.
+///
+/// This is mainly for cases where we can have non-repeating permutes
+/// in each lane.
+///
+/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
+/// we should investigate merging them.
+static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumEltsPerLane = NumElts / NumLanes;
+
+  SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
+  SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
+  SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
+
+  for (int i = 0; i != NumElts; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Ensure that each lane comes from a single source lane.
+    int SrcLane = M / NumEltsPerLane;
+    int DstLane = i / NumEltsPerLane;
+    if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
+      return SDValue();
+    SrcLaneMask[DstLane] = SrcLane;
+
+    LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
+    PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
+  }
+
+  // If we're only shuffling a single lowest lane and the rest are identity
+  // then don't bother.
+  // TODO - isShuffleMaskInputInPlace could be extended to something like this.
+  int NumIdentityLanes = 0;
+  bool OnlyShuffleLowestLane = true;
+  for (int i = 0; i != NumLanes; ++i) {
+    if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
+                                   i * NumEltsPerLane))
+      NumIdentityLanes++;
+    else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
+      OnlyShuffleLowestLane = false;
+  }
+  if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+    return SDValue();
+
+  SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
+  return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
+}
+
+/// Lower a vector shuffle crossing multiple 128-bit lanes as
 /// a permutation and blend of those lanes.
 ///
 /// This essentially blends the out-of-lane inputs to each lane into the lane
@@ -14162,6 +14216,11 @@
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return V;
 
+    // Try to permute the lanes and then use a per-lane permute.
+    if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+            DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
+      return V;
+
     // Otherwise, fall back.
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                    DAG, Subtarget);
@@ -14196,6 +14255,7 @@
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
     return Result;
+
   // If we have VLX support, we can use VEXPAND.
   if (Subtarget.hasVLX())
     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -91,9 +91,8 @@
 define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_1000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_1000:
@@ -174,10 +173,8 @@
 define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_2233:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_2233:
@@ -766,9 +763,8 @@
 define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_1000:
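
For reference, below is a standalone sketch (not part of the patch) of the mask decomposition that lowerVectorShuffleAsLanePermuteAndPermute performs: the shuffle is only handled when every destination 128-bit lane pulls all of its elements from a single source lane, in which case the mask splits into a whole-lane permute followed by an in-lane permute. The helper name splitMask and the -1 undef sentinel are hypothetical stand-ins for the patch's SM_SentinelUndef-based code; the example mask <1,0,0,0> comes from the shuffle_v4f64_1000 test above.

// Standalone illustration (hypothetical helper, not LLVM code): split a
// cross-lane shuffle mask into a whole-lane permute mask plus an in-lane
// permute mask, mirroring lowerVectorShuffleAsLanePermuteAndPermute.
// Undef mask elements are modelled as -1 (the patch uses SM_SentinelUndef).
#include <cstdio>
#include <vector>

static bool splitMask(const std::vector<int> &Mask, int NumEltsPerLane,
                      std::vector<int> &LaneMask, std::vector<int> &PermMask) {
  int NumElts = static_cast<int>(Mask.size());
  int NumLanes = NumElts / NumEltsPerLane;
  std::vector<int> SrcLane(NumLanes, -1);
  LaneMask.assign(NumElts, -1);
  PermMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int Src = M / NumEltsPerLane;
    int Dst = i / NumEltsPerLane;
    // Bail out if a destination lane needs elements from two source lanes.
    if (SrcLane[Dst] >= 0 && SrcLane[Dst] != Src)
      return false;
    SrcLane[Dst] = Src;
    LaneMask[i] = Src * NumEltsPerLane + (i % NumEltsPerLane); // move whole lane
    PermMask[i] = Dst * NumEltsPerLane + (M % NumEltsPerLane); // permute in lane
  }
  return true;
}

int main() {
  // Mask <1,0,0,0> for v4f64 (2 elements per 128-bit lane), as in the
  // shuffle_v4f64_1000 test above.
  std::vector<int> LaneMask, PermMask;
  if (splitMask({1, 0, 0, 0}, 2, LaneMask, PermMask)) {
    // Prints "LaneMask: 0 1 0 1" (broadcast lane 0 -> vinsertf128) and
    // "PermMask: 1 0 2 2" (in-lane swap -> vpermilpd ymm0[1,0,2,2]).
    printf("LaneMask:");
    for (int M : LaneMask)
      printf(" %d", M);
    printf("\nPermMask:");
    for (int M : PermMask)
      printf(" %d", M);
    printf("\n");
  }
  return 0;
}

In the patch itself the two masks feed two DAG.getVectorShuffle calls: the first builds the lane permute, and the second applies the per-lane permute to its result.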