Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -19061,6 +19061,79 @@
   }
 }
 
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                         SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
+      break;
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
+      if (Mask[0] != 0 || Mask[1] != 1 ||
+          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+        return false;
+
+      continue;
+
+    case X86ISD::PSHUFHW:
+      // Check that the high words (being shuffled) are the identity in the
+      // dword shuffle, and the low words are self-contained.
+      if (Mask[2] != 2 || Mask[3] != 3 ||
+          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+        return false;
+
+      continue;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Record the old value to use in RAUW-ing.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask.
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // Replace N with its operand as we're going to combine that shuffle away.
+  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+  // Replace the combinable shuffle with the combined one, updating all users
+  // so that we re-evaluate the chain here.
+  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+  return true;
+}
+
 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
 ///
 /// We walk up the chain, skipping shuffles of the other half and looking
@@ -19194,18 +19267,11 @@
       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
     }
 
-    // Fallthrough
+    break;
+
   case X86ISD::PSHUFD:
-    if (V.getOpcode() == N.getOpcode()) {
-      // If we have two sequential shuffles of the same kind we can always fold
-      // them. Even if there are multiple uses, this is beneficial because it
-      // breaks a dependency.
-      SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
-      for (int &M : Mask)
-        M = VMask[M];
-      return DAG.getNode(N.getOpcode(), DL, VT, V.getOperand(0),
-                         getV4X86ShuffleImm8ForMask(Mask, DAG));
-    }
+    if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle.
 
     break;
   }
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -157,9 +157,8 @@
 ; CHECK-SSE2:       # BB#0:
 ; CHECK-SSE2-NEXT:   pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
 ; CHECK-SSE2-NEXT:   pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:   pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
+; CHECK-SSE2-NEXT:   pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
 ; CHECK-SSE2-NEXT:   pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:   pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
 ; CHECK-SSE2-NEXT:   retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> 
   ret <8 x i16> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3,9 +3,69 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
+declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
 
+define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd1
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
+  ret <4 x i32> %c
+}
+
+define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd2
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd3
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd4
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd5
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
+  ret <4 x i32> %d
+}
+
 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
 ; CHECK-SSE2-LABEL: @combine_pshuflw1
 ; CHECK-SSE2:       # BB#0: