Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -8217,6 +8217,11 @@
                              getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
   }
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return Shift;
+
   // If we have a single input from V2 insert that into V1 if we can do so
   // cheaply.
   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
@@ -8243,11 +8248,6 @@
                                                     Subtarget, DAG))
       return Blend;
 
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v2i64, V1, V2, Mask, DAG))
-    return Shift;
-
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
   if (Subtarget->hasSSSE3())
@@ -8508,6 +8508,11 @@
                            getV4X86ShuffleImm8ForMask(Mask, DAG));
   }
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
@@ -8525,11 +8530,6 @@
                                                     Subtarget, DAG))
      return Blend;
 
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v4i32, V1, V2, Mask, DAG))
-    return Shift;
-
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
   if (Subtarget->hasSSSE3())
@@ -8593,17 +8593,17 @@
                                                           Mask, Subtarget, DAG))
     return Broadcast;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V, V, Mask, DAG))
+    return Shift;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
 
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V, V, Mask, DAG))
-    return Shift;
-
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
@@ -9210,6 +9210,11 @@
   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                             "to be V1-input shuffles.");
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
+
   // There are special ways we can lower some single-element blends.
   if (NumV2Inputs == 1)
     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
@@ -9227,11 +9232,6 @@
                                                     Subtarget, DAG))
       return Blend;
 
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V1, V2, Mask, DAG))
-    return Shift;
-
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
Index: llvm/trunk/test/CodeGen/X86/combine-or.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/combine-or.ll
+++ llvm/trunk/test/CodeGen/X86/combine-or.ll
@@ -271,9 +271,8 @@
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test21:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    orps %xmm1, %xmm0
-; CHECK-NEXT:    movq %xmm0, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> 
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32> 
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -667,14 +667,12 @@
 define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
 ; SSE-LABEL: shuffle_v2i64_1z:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_1z:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> 
   ret <2 x i64> %shuffle
@@ -683,14 +681,12 @@
 define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
 ; SSE-LABEL: shuffle_v2i64_z0:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq %xmm0, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_z0:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> 
   ret <2 x i64> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -775,37 +775,27 @@
 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_zuu4:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_zuu4:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_zuu4:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_zuu4:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zuu4:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> 
   ret <4 x i32> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1429,15 +1429,13 @@
 define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movzwl %di, %eax
-; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movd %edi, %xmm0
 ; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vmovd %edi, %xmm0
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
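For context, the transform being hoisted above blends and rotations in this patch is byte-shift lowering: a 128-bit shuffle whose non-zero lanes are just the input's bytes moved by a whole number of byte positions, with zeros shifted in, can be emitted as a single PSLLDQ/PSRLDQ. That is why tests such as shuffle_v2i64_z0 now check for a lone pslldq instead of a movq/pshufd pair. The sketch below is a minimal standalone illustration of the matching idea only, not the actual lowerVectorShuffleAsByteShift implementation; the function name matchByteShift, the sentinel constants, and the single-input simplification are assumptions made for the example.

// Standalone sketch of byte-shift matching (hypothetical names, simplified to
// a single input): each mask entry is a source byte index 0..15, SM_Undef
// (any value is acceptable) or SM_Zero (the byte must be zero).  A positive
// return value means PSLLDQ by that many bytes, a negative value means
// PSRLDQ, and 0 means no whole-register byte shift produces this mask.
#include <array>

constexpr int SM_Undef = -1; // byte may hold anything
constexpr int SM_Zero = -2;  // byte must be zero

int matchByteShift(const std::array<int, 16> &Mask) {
  for (int Shift = -15; Shift <= 15; ++Shift) {
    if (Shift == 0)
      continue;
    bool Matches = true;
    for (int i = 0; i < 16 && Matches; ++i) {
      int Src = i - Shift; // source byte a shift by 'Shift' places at byte i
      if (Src < 0 || Src > 15)
        // Bytes shifted in from outside the register are zero.
        Matches = (Mask[i] == SM_Undef || Mask[i] == SM_Zero);
      else
        // In-range bytes must come from the expected source position.
        Matches = (Mask[i] == SM_Undef || Mask[i] == Src);
    }
    if (Matches)
      return Shift;
  }
  return 0;
}

Under these assumptions, shuffle_v2i64_z0 expands to eight must-be-zero bytes followed by bytes 0..7 of the input, which matches Shift = +8 (pslldq $8), while shuffle_v2i64_1z expands to bytes 8..15 followed by eight zero bytes and matches Shift = -8 (psrldq $8), in line with the updated CHECK lines above.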