Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11877,6 +11877,89 @@
     return V;
   }
 
+  // If this shuffle only has a single input that is a bitcasted shuffle,
+  // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
+  // back to their original types.
+  if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
+      N1.getOpcode() == ISD::UNDEF && Level < AfterLegalizeVectorOps &&
+      TLI.isTypeLegal(VT)) {
+
+    // Peek through the bitcast only if there is one user.
+    SDValue BC0 = N0;
+    while (BC0.getOpcode() == ISD::BITCAST) {
+      if (!BC0.hasOneUse())
+        break;
+      BC0 = BC0.getOperand(0);
+    }
+
+    auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
+      if (Scale == 1)
+        return SmallVector<int, 8>(Mask.begin(), Mask.end());
+
+      SmallVector<int, 8> NewMask;
+      for (int M : Mask)
+        for (int s = 0; s != Scale; ++s)
+          NewMask.push_back(M < 0 ? -1 : Scale * M + s);
+      return NewMask;
+    };
+
+    if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
+      EVT SVT = VT.getScalarType();
+      EVT InnerVT = BC0->getValueType(0);
+      EVT InnerSVT = InnerVT.getScalarType();
+
+      // Determine which shuffle works with the smaller scalar type.
+      EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
+      EVT ScaleSVT = ScaleVT.getScalarType();
+
+      if (TLI.isTypeLegal(ScaleVT) &&
+          0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
+          0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
+
+        int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+        int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+
+        // Scale the shuffle masks to the smaller scalar type.
+        ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
+        SmallVector<int, 8> InnerMask =
+            ScaleShuffleMask(InnerSVN->getMask(), InnerScale);
+        SmallVector<int, 8> OuterMask =
+            ScaleShuffleMask(SVN->getMask(), OuterScale);
+
+        // Merge the shuffle masks.
+        SmallVector<int, 8> NewMask;
+        for (int M : OuterMask)
+          NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
+
+        // Test for shuffle mask legality over both commutations.
+        SDValue SV0 = BC0->getOperand(0);
+        SDValue SV1 = BC0->getOperand(1);
+        bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+        if (!LegalMask) {
+          for (int i = 0, e = (int)NewMask.size(); i != e; ++i) {
+            int idx = NewMask[i];
+            if (idx < 0)
+              continue;
+            else if (idx < e)
+              NewMask[i] = idx + e;
+            else
+              NewMask[i] = idx - e;
+          }
+          std::swap(SV0, SV1);
+          LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+        }
+
+        if (LegalMask) {
+          SV0 = DAG.getNode(ISD::BITCAST, SDLoc(N), ScaleVT, SV0);
+          SV1 = DAG.getNode(ISD::BITCAST, SDLoc(N), ScaleVT, SV1);
+          return DAG.getNode(
+              ISD::BITCAST, SDLoc(N), VT,
+              DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
+        }
+      }
+    }
+  }
+
   // Canonicalize shuffles according to rules:
   //  shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
   //  shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
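To make the mask arithmetic above easier to follow, here is a minimal standalone sketch (plain C++ with std::vector standing in for llvm::SmallVector; the driver values are illustrative assumptions, not part of the patch). It replays ScaleShuffleMask and the merge loop on the pattern exercised by the shuffle_v4i32_bitcast_0415 test added further down: an inner <4 x i32> shuffle followed, through a bitcast, by an outer <2 x i64> shuffle.

// Illustrative sketch only -- not patch code. std::vector stands in for
// llvm::SmallVector; the masks below are assumptions chosen to match the
// shuffle_v4i32_bitcast_0415 test added by this commit.
#include <cstdio>
#include <vector>

// Widen each mask index when the same data is viewed with Scale-x narrower
// elements: index M becomes Scale*M .. Scale*M+Scale-1 (-1 stays undef).
static std::vector<int> scaleShuffleMask(const std::vector<int> &Mask,
                                         int Scale) {
  std::vector<int> NewMask;
  for (int M : Mask)
    for (int s = 0; s != Scale; ++s)
      NewMask.push_back(M < 0 ? -1 : Scale * M + s);
  return NewMask;
}

int main() {
  std::vector<int> InnerMask = {1, 5, 0, 4}; // <4 x i32> shuffle of %a, %b
  std::vector<int> OuterMask = {1, 0};       // <2 x i64> shuffle after bitcast

  // The common scalar type is i32: the inner mask is already i32-granular
  // (scale 1); the outer i64 mask is scaled by 64/32 = 2 -> {2,3,0,1}.
  std::vector<int> Inner = scaleShuffleMask(InnerMask, 1);
  std::vector<int> Outer = scaleShuffleMask(OuterMask, 2);

  // Merge: every outer lane selects a lane of the inner shuffle's result.
  std::vector<int> Merged;
  for (int M : Outer)
    Merged.push_back(M < 0 ? -1 : Inner[M]);

  for (int M : Merged)
    std::printf("%d ", M); // prints "0 4 1 5": a single punpckldq
  std::printf("\n");
}

The two bitcasted shuffles thus collapse into one shuffle at the narrower element width, which the tests below verify lowers to a single unpack instruction.
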
Index: llvm/trunk/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll
+++ llvm/trunk/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mtriple=x86_64-pc-win32 | FileCheck %s
-
-; CHECK: test
-; CHECK: vpmovzxwd
-; CHECK: vpmovzxwd
-define void @test(<4 x i64> %a, <4 x i16>* %buf) {
-  %ex1 = extractelement <4 x i64> %a, i32 0
-  %ex2 = extractelement <4 x i64> %a, i32 1
-  %x1 = bitcast i64 %ex1 to <4 x i16>
-  %x2 = bitcast i64 %ex2 to <4 x i16>
-  %Sh = shufflevector <4 x i16> %x1, <4 x i16> %x2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  store <4 x i16> %Sh, <4 x i16>* %buf, align 1
-  ret void
-}
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1336,3 +1336,22 @@
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32>
   ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: retq
+  %shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
+  %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
+  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
+  %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %bitcast8 = bitcast <8 x i16> %shuffle16 to <16 x i8>
+  ret <16 x i8> %bitcast8
+}
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -810,6 +810,25 @@
   ret <2 x double> %shuffle
 }
 
+define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_bitcast_1z:
+; SSE: # BB#0:
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_bitcast_1z:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; AVX-NEXT: retq
+  %shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x float>
+  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %bitcast64 = bitcast <4 x float> %shuffle32 to <2 x double>
+  ret <2 x double> %bitcast64
+}
+
 define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
 ; SSE-LABEL: insert_reg_and_zero_v2i64:
 ; SSE: # BB#0:
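The commutation fallback in the DAGCombiner hunk above (the idx +/- e loop) can also be checked in isolation. The sketch below, again illustrative plain C++ rather than patch code with an assumed example mask, shows that re-pointing every index across the operand boundary while swapping the two inputs describes the same shuffle, which is why legality is worth retesting after the swap.

// Illustrative sketch only -- not patch code. Indices 0..e-1 select lanes
// of the first input, e..2e-1 lanes of the second; -1 is an undef lane.
#include <cstdio>
#include <vector>

static void commuteMask(std::vector<int> &Mask) {
  int e = (int)Mask.size();
  for (int &Idx : Mask) {
    if (Idx < 0)
      continue;                        // undef lanes are unaffected
    Idx = Idx < e ? Idx + e : Idx - e; // cross the operand boundary
  }
}

int main() {
  // shuffle(A, B, {1,4,-1,6}) == shuffle(B, A, {5,0,-1,2})
  std::vector<int> Mask = {1, 4, -1, 6};
  commuteMask(Mask);
  for (int Idx : Mask)
    std::printf("%d ", Idx); // prints "5 0 -1 2"
  std::printf("\n");
}
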
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1574,6 +1574,23 @@
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_bitcast_0415:
+; SSE: # BB#0:
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_bitcast_0415:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: retq
+  %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
+  %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
+  %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
+  ret <4 x i32> %bitcast32
+}
+
 define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
 ; SSE-LABEL: insert_reg_and_zero_v4i32:
 ; SSE: # BB#0:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -922,3 +922,22 @@
   %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
   ret <4 x double> %1
 }
+
+define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: bitcast_v4f64_0426:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitcast_v4f64_0426:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: retq
+  %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float>
+  %shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+  %bitcast16 = bitcast <8 x float> %shuffle32 to <16 x i16>
+  %shuffle16 = shufflevector <16 x i16> %bitcast16, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
+  %bitcast64 = bitcast <16 x i16> %shuffle16 to <4 x double>
+  ret <4 x double> %bitcast64
+}
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -8,14 +8,14 @@
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-NEXT: movlpd %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test0:
 ; X64: ## BB#0: ## %entry
 ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-NEXT: movq %xmm0, (%rdi)
 ; X64-NEXT: retq
 entry:
@@ -84,16 +84,15 @@
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl L_tmp_V2i$non_lazy_ptr, %eax
 ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT: movlpd %xmm0, (%eax)
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: movlps %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test2:
 ; X64: ## BB#0: ## %entry
 ; X64-NEXT: movq _tmp_V2i@{{.*}}(%rip), %rax
 ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X64-NEXT: movq %xmm0, (%rax)
 ; X64-NEXT: retq
 entry: