Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7802,6 +7802,79 @@
   return SDValue();
 }
 
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
+/// SSE2 and AVX2 logical bit-shift instructions. The function matches
+/// elements from one of the input vectors shuffled to the left or right
+/// with zeroable elements 'shifted in'.
+static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  int Size = Mask.size();
+  assert(Size == VT.getVectorNumElements() && "Unexpected mask size");
+
+  // PSRL : (little-endian) right bit shift.
+  // [ 1, zz, 3, zz]
+  // [ -1, -1, 7, zz]
+  // PSLL : (little-endian) left bit shift.
+  // [ zz, 0, zz, 2 ]
+  // [ -1, 4, zz, -1 ]
+  auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
+    MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+    MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+    assert(TLI.isTypeLegal(ShiftVT) && "Illegal integer vector type");
+
+    bool MatchLeft = true, MatchRight = true;
+    for (int i = 0; i != Size; i += Scale) {
+      for (int j = 0; j != Shift; j++) {
+        MatchLeft &= Zeroable[i + j];
+      }
+      for (int j = Scale - Shift; j != Scale; j++) {
+        MatchRight &= Zeroable[i + j];
+      }
+    }
+    if (!(MatchLeft || MatchRight))
+      return SDValue();
+
+    bool MatchV1 = true, MatchV2 = true;
+    for (int i = 0; i != Size; i += Scale) {
+      unsigned Pos = MatchLeft ? i + Shift : i;
+      unsigned Low = MatchLeft ? i : i + Shift;
+      unsigned Len = Scale - Shift;
+      MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+      MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
+    }
+    if (!(MatchV1 || MatchV2))
+      return SDValue();
+
+    // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
+    unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
+    int ShiftAmt = Shift * VT.getScalarSizeInBits();
+    SDValue V = MatchV1 ? V1 : V2;
+    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  };
+
+  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+  // keep doubling the size of the integer elements up to that. We can
+  // then shift the elements of the integer vector by whole multiples of
+  // their width within the elements of the larger integer vector. Test each
+  // multiple to see if we can find a match with the moved element indices
+  // and that the shifted in elements are all zeroable.
+  for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
+    for (int Shift = 1; Shift != Scale; Shift++)
+      if (SDValue BitShift = MatchBitShift(Shift, Scale))
+        return BitShift;
+
+  // no match
+  return SDValue();
+}
+
 /// \brief Lower a vector shuffle as a zero or any extension.
 ///
 /// Given a specific number of elements, element bit width, and extension
@@ -8623,6 +8696,11 @@
                           getV4X86ShuffleImm8ForMask(Mask, DAG));
   }
 
+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to use byte shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsByteShift(
           DL, MVT::v4i32, V1, V2, Mask, DAG))
@@ -8708,6 +8786,11 @@
                                                      Mask, Subtarget, DAG))
     return Broadcast;
 
+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v8i16, V, V, Mask, DAG))
+    return Shift;
+
   // Try to use byte shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsByteShift(
           DL, MVT::v8i16, V, V, Mask, DAG))
@@ -9325,6 +9408,11 @@
   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                             "to be V1-input shuffles.");
 
+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to use byte shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsByteShift(
           DL, MVT::v8i16, V1, V2, Mask, DAG))
@@ -9481,6 +9569,11 @@
   ArrayRef<int> OrigMask = SVOp->getMask();
   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+    return Shift;
+
   // Try to use byte shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsByteShift(
           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
@@ -10526,6 +10619,11 @@
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
 
+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v8i32, V1, V2, Mask, DAG))
+    return Shift;
+
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -10595,6 +10693,11 @@
                                                      Mask, Subtarget, DAG))
     return Broadcast;
 
+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v16i16, V1, V2, Mask, DAG))
+    return Shift;
+
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -10670,6 +10773,11 @@
                                                      Mask, Subtarget, DAG))
     return Broadcast;
 
+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return Shift;
+
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
Index: test/CodeGen/X86/combine-or.ll
===================================================================
--- test/CodeGen/X86/combine-or.ll
+++ test/CodeGen/X86/combine-or.ll
@@ -206,12 +206,10 @@
 define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test17:
 ; CHECK: # BB#0:
+; CHECK-NEXT: psllq $32, %xmm0
 ; CHECK-NEXT: xorps %xmm2, %xmm2
 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0]
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; CHECK-NEXT: orps %xmm1, %xmm2
-; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
 ; CHECK-NEXT: retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
Index: test/CodeGen/X86/vec_insert-5.ll
===================================================================
--- test/CodeGen/X86/vec_insert-5.ll
+++ test/CodeGen/X86/vec_insert-5.ll
@@ -63,7 +63,7 @@
 define <16 x i8> @t5(<16 x i8> %x) nounwind {
 ; CHECK-LABEL: t5:
 ; CHECK: # BB#0:
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: psrlw $8, %xmm0
 ; CHECK-NEXT: retl
   %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32>
   ret <16 x i8> %s
@@ -72,7 +72,7 @@
 define <16 x i8> @t6(<16 x i8> %x) nounwind {
 ; CHECK-LABEL: t6:
 ; CHECK: # BB#0:
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: psrlw $8, %xmm0
 ; CHECK-NEXT: retl
   %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32>
   ret <16 x i8> %s
@@ -86,3 +86,21 @@
   %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32>
   ret <16 x i8> %s
 }
+
+define <16 x i8> @t8(<16 x i8> %x) nounwind {
+; CHECK-LABEL: t8:
+; CHECK: # BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @t9(<16 x i8> %x) nounwind {
+; CHECK-LABEL: t9:
+; CHECK: # BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32>
+  ret <16 x i8> %s
+}
Index: test/CodeGen/X86/vector-idiv.ll
===================================================================
--- test/CodeGen/X86/vector-idiv.ll
+++ test/CodeGen/X86/vector-idiv.ll
@@ -109,11 +109,11 @@
 ; AVX-LABEL: test2:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm2
+; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3
 ; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
 ; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: vpsrld $1, %ymm0, %ymm0
@@ -958,11 +958,11 @@
 ; AVX-LABEL: test9:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:
vpsrlq $32, %ymm1, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 ; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 ; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vpsrld $31, %ymm0, %ymm1 @@ -1049,11 +1049,11 @@ ; AVX-LABEL: test10: ; AVX: # BB#0: ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 ; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 ; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vpsrld $1, %ymm2, %ymm2 @@ -1158,11 +1158,11 @@ ; AVX-LABEL: test11: ; AVX: # BB#0: ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 ; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 ; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1 ; AVX-NEXT: vpsrld $31, %ymm1, %ymm2 Index: test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v16.ll +++ test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -573,21 +573,21 @@ ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: ; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSSE3-NEXT: pslld $24, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: ; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSE41-NEXT: pslld $24, %xmm0 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: ; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; AVX-NEXT: vpslld $24, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 3 @@ -1053,6 +1053,108 @@ ret void } +; +; Shuffle to logical bit shifts +; + +define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: +; SSE: # BB#0: +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: 
shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: +; AVX: # BB#0: +; AVX-NEXT: vpsllw $8, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: +; SSE: # BB#0: +; SSE-NEXT: pslld $24, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: +; AVX: # BB#0: +; AVX-NEXT: vpslld $24, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08: +; SSE: # BB#0: +; SSE-NEXT: psllq $56, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $56, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14: +; SSE: # BB#0: +; SSE-NEXT: psllq $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $8, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz: +; SSE: # BB#0: +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrlw $8, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz: +; SSE: # BB#0: +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrld $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz: +; SSE: # BB#0: +; SSE-NEXT: psrlq $56, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $56, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) { ; SSE2-LABEL: PR12412: ; SSE2: # BB#0: # %entry Index: test/CodeGen/X86/vector-shuffle-128-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v4.ll +++ 
test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1359,3 +1359,35 @@ %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> ret <4 x float> %shuffle } + +; +; Shuffle to logical bit shifts +; + +define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) { +; SSE-LABEL: shuffle_v4i32_z0zX: +; SSE: # BB#0: +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_z0zX: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $32, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) { +; SSE-LABEL: shuffle_v4i32_1z3z: +; SSE: # BB#0: +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_1z3z: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $32, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %shuffle +} Index: test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v8.ll +++ test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1917,3 +1917,118 @@ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } + +; +; Shuffle to logical bit shifts +; +define <8 x i16> @shuffle_v8i16_z0z2z4z6(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_z0z2z4z6: +; SSE: # BB#0: +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_z0z2z4z6: +; AVX: # BB#0: +; AVX-NEXT: vpslld $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zzz0zzz4(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_zzz0zzz4: +; SSE: # BB#0: +; SSE-NEXT: psllq $48, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zzz0zzz4: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $48, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zz01zX4X(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_zz01zX4X: +; SSE: # BB#0: +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zz01zX4X: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $32, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_z0X2z456(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_z0X2z456: +; SSE: # BB#0: +; SSE-NEXT: psllq $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_z0X2z456: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_1z3zXz7z(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_1z3zXz7z: +; SSE: # BB#0: +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_1z3zXz7z: +; AVX: # BB#0: +; AVX-NEXT: vpsrld $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_1X3z567z(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_1X3z567z: +; SSE: # BB#0: +; SSE-NEXT: psrlq $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_1X3z567z: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define 
<8 x i16> @shuffle_v8i16_23zz67zz(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_23zz67zz: +; SSE: # BB#0: +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_23zz67zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $32, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_3zXXXzzz(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_3zXXXzzz: +; SSE: # BB#0: +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_3zXXXzzz: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $48, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1362,3 +1362,107 @@ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } + +; +; Shuffle to logical bit shifts +; + +define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) { +; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $16, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i16> %a) { +; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllq $48, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, 
<16 x i32> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) { +; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrld $16, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) { +; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,2,3,2,3,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlq $32, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> + ret <16 x i16> %shuffle +} Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1654,3 +1654,145 @@ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } + +; +; Shuffle to logical bit shifts +; + +define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw $0, 
%xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllw $8, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,0,1,128,128,4,5,128,128,8,9,128,128,12,13] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $16, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,0,1,128,128,128,128,128,128,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,0,0,0,0,0],zero,zero,xmm3[0,0,0,0,0,0],zero,zero +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllq $48, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlw $8, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,128,128,6,7,128,128,10,11,128,128,14,15,128,128] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0] +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrld $16, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,128,128,128,15,128,128,128,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm3[0,0,0],zero,xmm3[0,0,0,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlq $56, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} Index: 
test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1850,6 +1850,42 @@ ret <8 x float> %1 } +; +; Shuffle to logical bit shifts +; + +define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) { +; AVX1-LABEL: shuffle_v8i32_z0U2zUz6: +; AVX1: # BB#0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_z0U2zUz6: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllq $32, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) { +; AVX1-LABEL: shuffle_v8i32_1U3z5zUU: +; AVX1: # BB#0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_1U3z5zUU: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlq $32, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> + ret <8 x i32> %shuffle +} + define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) { ; ALL-LABEL: concat_v2f32_1: ; ALL: # BB#0: # %entry
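
For reference, the Scale/Shift mask-matching search performed by lowerVectorShuffleAsBitShift can be illustrated outside of SelectionDAG. The following is a minimal standalone sketch in plain C++, not part of the patch: all names are hypothetical, it only considers matches against the first input vector, and it omits the 64-bit cap that the real code applies to the widened element type, so it shows the idea rather than the actual implementation.

// Standalone sketch of the bit-shift mask matching idea: treat groups of
// Scale narrow elements as one wider lane and check whether the mask moves
// elements by Shift positions within each group while the vacated positions
// are all zeroable.
#include <cstdio>
#include <vector>

struct BitShiftMatch {
  bool Found = false;
  bool Left = false; // true: PSLL-style shift, false: PSRL-style shift
  int Scale = 0;     // narrow elements per widened lane
  int Shift = 0;     // shift amount, in narrow elements
};

// Mask uses -1 for undef; Zeroable[i] is true if lane i may be treated as zero.
static BitShiftMatch matchBitShift(const std::vector<int> &Mask,
                                   const std::vector<bool> &Zeroable) {
  int Size = (int)Mask.size();
  for (int Scale = 2; Scale <= Size; Scale *= 2) {
    for (int Shift = 1; Shift != Scale; ++Shift) {
      // Within every group of Scale lanes, a left shift zeroes the low Shift
      // lanes and a right shift zeroes the high Shift lanes.
      bool MatchLeft = true, MatchRight = true;
      for (int i = 0; i != Size; i += Scale) {
        for (int j = 0; j != Shift; ++j)
          MatchLeft = MatchLeft && Zeroable[i + j];
        for (int j = Scale - Shift; j != Scale; ++j)
          MatchRight = MatchRight && Zeroable[i + j];
      }
      if (!MatchLeft && !MatchRight)
        continue;
      // The surviving lanes must be a contiguous run taken from the matching
      // position of the same group in the source vector.
      bool Left = MatchLeft;
      bool Ok = true;
      for (int i = 0; i != Size && Ok; i += Scale) {
        int Pos = Left ? i + Shift : i; // where the kept run lands
        int Low = Left ? i : i + Shift; // where the kept run comes from
        for (int k = 0; k != Scale - Shift; ++k)
          Ok = Ok && (Mask[Pos + k] < 0 || Mask[Pos + k] == Low + k);
      }
      if (Ok)
        return {true, Left, Scale, Shift};
    }
  }
  return {};
}

int main() {
  // The [zz, 0, zz, 2] example from the comments above: lanes 0 and 2 are
  // zeroable (their mask entries are don't-care here), so the shuffle is a
  // left shift by one i32 within each i64 lane.
  std::vector<int> Mask = {-1, 0, -1, 2};
  std::vector<bool> Zeroable = {true, false, true, false};
  BitShiftMatch M = matchBitShift(Mask, Zeroable);
  if (M.Found)
    std::printf("%s shift by %d element(s) within groups of %d\n",
                M.Left ? "left" : "right", M.Shift, M.Scale);
  return 0;
}

Run on that example, the sketch reports a left shift by one element within groups of two, which for 32-bit elements corresponds to the PSLLQ $32 case exercised by shuffle_v4i32_z0zX above.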