Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -19034,6 +19034,95 @@ return SDValue(); } +/// \brief Get the PSHUF-style mask from PSHUF node. +/// +/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 +/// PSHUF-style masks that can be reused with such instructions. +static SmallVector getPSHUFShuffleMask(SDValue N) { + SmallVector Mask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); + (void)HaveMask; + assert(HaveMask); + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + return Mask; + case X86ISD::PSHUFLW: + Mask.resize(4); + return Mask; + case X86ISD::PSHUFHW: + Mask.erase(Mask.begin(), Mask.begin() + 4); + for (int &M : Mask) + M -= 4; + return Mask; + default: + llvm_unreachable("No valid shuffle instruction found!"); + } +} + +/// \brief Try to combine x86 target specific shuffles. +static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + SmallVector Mask; + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + Mask = getPSHUFShuffleMask(N); + assert(Mask.size() == 4); + break; + default: + return SDValue(); + } + + SDValue V = N.getOperand(0); + switch (N.getOpcode()) { + default: + break; + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + assert(VT == MVT::v8i16); + + // See if this reduces to a PSHUFD which is no more expensive and can + // combine with more operations. + if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && + areAdjacentMasksSequential(Mask)) { + int DMask[] = {-1, -1, -1, -1}; + int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; + DMask[DOffset + 0] = DOffset + Mask[0] / 2; + DMask[DOffset + 1] = DOffset + Mask[2] / 2; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + DCI.AddToWorklist(V.getNode()); + V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, + getV4X86ShuffleImm8ForMask(DMask, DAG)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + } + + // Fallthrough + case X86ISD::PSHUFD: + if (V.getOpcode() == N.getOpcode()) { + // If we have two sequential shuffles of the same kind we can always fold + // them. Even if there are multiple uses, this is beneficial because it + // breaks a dependency. + SmallVector VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + return DAG.getNode(N.getOpcode(), DL, VT, V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + break; + } + + return SDValue(); +} + /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -19158,7 +19247,18 @@ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + if (LD.getNode()) + return LD; + + if (isTargetShuffle(N->getOpcode())) { + SDValue Shuffle = + PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + + return SDValue(); } /// PerformTruncateCombine - Converts truncate operation to Index: llvm/trunk/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll +++ llvm/trunk/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll @@ -1,30 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep -- -86 - -define i16 @f(<4 x float>* %tmp116117.i1061.i) nounwind { -entry: - alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:0 [#uses=167] - alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:1 [#uses=170] - alloca [4 x <4 x i32>] ; <[4 x <4 x i32>]*>:2 [#uses=12] - %.sub6235.i = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0 ; <<4 x float>*> [#uses=76] - %.sub.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0 ; <<4 x float>*> [#uses=59] - - %tmp124.i1062.i = getelementptr <4 x float>* %tmp116117.i1061.i, i32 63 ; <<4 x float>*> [#uses=1] - %tmp125.i1063.i = load <4 x float>* %tmp124.i1062.i ; <<4 x float>> [#uses=5] - %tmp828.i1077.i = shufflevector <4 x float> %tmp125.i1063.i, <4 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > ; <<4 x float>> [#uses=4] - %tmp704.i1085.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1] - %tmp712.i1086.i = call <4 x float> @llvm.x86.sse.max.ps( <4 x float> %tmp704.i1085.i, <4 x float> %tmp828.i1077.i ) ; <<4 x float>> [#uses=1] - store <4 x float> %tmp712.i1086.i, <4 x float>* %.sub.i - - %tmp2587.i1145.gep.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0, i32 2 ; [#uses=1] - %tmp5334.i = load float* %tmp2587.i1145.gep.i ; [#uses=5] - %tmp2723.i1170.i = insertelement <4 x float> undef, float %tmp5334.i, i32 2 ; <<4 x float>> [#uses=5] - store <4 x float> %tmp2723.i1170.i, <4 x float>* %.sub6235.i - - %tmp1406.i1367.i = shufflevector <4 x float> %tmp2723.i1170.i, <4 x float> undef, <4 x i32> < i32 2, i32 2, i32 2, i32 2 > ; <<4 x float>> [#uses=1] - %tmp84.i1413.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1] - %tmp89.i1415.i = fmul <4 x float> %tmp84.i1413.i, %tmp1406.i1367.i ; <<4 x float>> [#uses=1] - store <4 x float> %tmp89.i1415.i, <4 x float>* %.sub.i - ret i16 0 -} - -declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -51,7 +51,7 @@ ; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745 ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -159,7 +159,7 @@ ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -273,8 +273,7 @@ ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,3,0,1,4,5,6,7] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,2,3] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -285,8 +284,7 @@ ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3] ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -297,8 +295,7 @@ ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,3,0,1,4,5,6,7] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3] ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> @@ -359,9 +356,8 @@ define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879 ; CHECK-SSE2: # BB#0: +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,0,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[2,3,0,1,4,5,6,7] ; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> @@ -458,8 +454,7 @@ ; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7] ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,0] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle