Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -3851,7 +3851,7 @@ return true; } -bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, +bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; @@ -3873,7 +3873,7 @@ /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified -/// sequential range (L, L+Pos]. or is undef. +/// sequential range (L, L+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) @@ -6036,7 +6036,7 @@ return NewLd; } - + //TODO: The code below fires only for for loading the low v2i32 / v2f32 //of a v4i32 / v4f32. It's probably worth generalizing. if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && @@ -7023,7 +7023,7 @@ // Check for a build vector of consecutive loads. if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; - + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. @@ -7693,17 +7693,6 @@ int Size = Mask.size(); int Scale = 16 / Size; - auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset, - ArrayRef Mask) { - for (int i = StartIndex; i < EndIndex; i++) { - if (Mask[i] < 0) - continue; - if (i + Base != Mask[i] - MaskOffset) - return false; - } - return true; - }; - for (int Shift = 1; Shift < Size; Shift++) { int ByteShift = Shift * Scale; @@ -7717,8 +7706,10 @@ } if (ZeroableRight) { - bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask); - bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask); + bool ValidShiftRight1 = + isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift); + bool ValidShiftRight2 = + isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift); if (ValidShiftRight1 || ValidShiftRight2) { // Cast the inputs to v2i64 to match PSRLDQ. @@ -7740,8 +7731,10 @@ } if (ZeroableLeft) { - bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask); - bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask); + bool ValidShiftLeft1 = + isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0); + bool ValidShiftLeft2 = + isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size); if (ValidShiftLeft1 || ValidShiftLeft2) { // Cast the inputs to v2i64 to match PSLLDQ. @@ -7757,6 +7750,114 @@ return SDValue(); } +/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). +/// +/// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q) +/// SSE2 and AVX2 logical bit-shift instructions. The function matches +/// elements from one of the input vectors shuffled to the left or right +/// with zeroable elements 'shifted in'. 
+static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + SelectionDAG &DAG) { + const MVT::SimpleValueType ShiftMapping[][2] = {// SSE2 + {MVT::v4i32, MVT::v2i64}, + {MVT::v8i16, MVT::v4i32}, + {MVT::v8i16, MVT::v2i64}, + {MVT::v16i8, MVT::v8i16}, + {MVT::v16i8, MVT::v4i32}, + {MVT::v16i8, MVT::v2i64}, + // AVX2 + {MVT::v8i32, MVT::v4i64}, + {MVT::v16i16, MVT::v8i32}, + {MVT::v16i16, MVT::v4i64}, + {MVT::v32i8, MVT::v16i16}, + {MVT::v32i8, MVT::v8i32}, + {MVT::v32i8, MVT::v4i64}}; + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + for (auto map : ShiftMapping) { + if (VT.SimpleTy != map[0]) + continue; + + MVT ShiftVT = MVT(map[1]); + int Size = ShiftVT.getVectorNumElements(); + int Scale = VT.getVectorNumElements() / Size; + + // We can shift the elements of the integer vector by whole multiples of + // their width within the elements of the larger integer vector. Test each + // multiple to see if we can find a match with the moved element indices + // and that the shifted in elements are all zeroable. + for (int Shift = 1; Shift != Scale; Shift++) { + int ShiftAmt = Shift * VT.getScalarSizeInBits(); + + // PSRL : (little-endian) right bit shift. + // [ 1, zz, 3, zz] + // [ -1, -1, 7, zz] + bool ZeroableRight = true; + for (int i = 0, e = Size * Scale; i != e; i += Scale) { + for (int j = Scale - Shift; j < Scale; j++) { + ZeroableRight &= Zeroable[i + j]; + } + } + + if (ZeroableRight) { + bool ValidShiftRight1 = true; + bool ValidShiftRight2 = true; + + for (unsigned i = 0, e = Size * Scale; i != e; i += Scale) { + ValidShiftRight1 &= isSequentialOrUndefInRange( + Mask, i, Scale - Shift, i + Shift); + ValidShiftRight2 &= isSequentialOrUndefInRange( + Mask, i, Scale - Shift, i + Shift + Mask.size()); + } + + if (ValidShiftRight1 || ValidShiftRight2) { + // Cast the inputs to ShiftVT to match VSRLI and then back again. + SDValue &TargetV = ValidShiftRight1 ? V1 : V2; + SDValue V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, TargetV); + SDValue Shifted = DAG.getNode(X86ISD::VSRLI, DL, ShiftVT, V, + DAG.getConstant(ShiftAmt, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); + } + } + + // PSHL : (little-endian) left bit shift. + // [ zz, 0, zz, 2 ] + // [ -1, 4, zz, -1 ] + bool ZeroableLeft = true; + for (int i = 0, e = Size * Scale; i != e; i += Scale) { + for (int j = 0; j < Shift; j++) { + ZeroableLeft &= Zeroable[i + j]; + } + } + + if (ZeroableLeft) { + bool ValidShiftLeft1 = true; + bool ValidShiftLeft2 = true; + + for (int i = 0, e = Size * Scale; i != e; i += Scale) { + ValidShiftLeft1 &= isSequentialOrUndefInRange( + Mask, i + Shift, Scale - Shift, i); + ValidShiftLeft2 &= isSequentialOrUndefInRange( + Mask, i + Shift, Scale - Shift, i + Mask.size()); + } + + if (ValidShiftLeft1 || ValidShiftLeft2) { + // Cast the inputs to ShiftVT to match VSHLI and then back again. + SDValue &TargetV = ValidShiftLeft1 ? V1 : V2; + SDValue V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, TargetV); + SDValue Shifted = DAG.getNode(X86ISD::VSHLI, DL, ShiftVT, V, + DAG.getConstant(ShiftAmt, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); + } + } + } + } + + return SDValue(); +} + /// \brief Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension @@ -8530,6 +8631,11 @@ DL, MVT::v4i32, V1, V2, Mask, DAG)) return Shift; + // Try to use bit shift instructions. 
+ if (SDValue Shift = lowerVectorShuffleAsBitShift( + DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Shift; + // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, @@ -8615,6 +8721,11 @@ DL, MVT::v8i16, V, V, Mask, DAG)) return Shift; + // Try to use bit shift instructions. + if (SDValue Shift = lowerVectorShuffleAsBitShift( + DL, MVT::v8i16, V, V, Mask, DAG)) + return Shift; + // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); @@ -9232,6 +9343,11 @@ DL, MVT::v8i16, V1, V2, Mask, DAG)) return Shift; + // Try to use bit shift instructions. + if (SDValue Shift = lowerVectorShuffleAsBitShift( + DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Shift; + // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2, @@ -9388,6 +9504,11 @@ DL, MVT::v16i8, V1, V2, OrigMask, DAG)) return Shift; + // Try to use bit shift instructions. + if (SDValue Shift = lowerVectorShuffleAsBitShift( + DL, MVT::v16i8, V1, V2, OrigMask, DAG)) + return Shift; + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) @@ -10398,6 +10519,11 @@ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + // Try to use bit shift instructions. + if (SDValue Shift = lowerVectorShuffleAsBitShift( + DL, MVT::v8i32, V1, V2, Mask, DAG)) + return Shift; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Blend; @@ -10467,6 +10593,11 @@ Mask, Subtarget, DAG)) return Broadcast; + // Try to use bit shift instructions. + if (SDValue Shift = lowerVectorShuffleAsBitShift( + DL, MVT::v16i16, V1, V2, Mask, DAG)) + return Shift; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Blend; @@ -10542,6 +10673,11 @@ Mask, Subtarget, DAG)) return Broadcast; + // Try to use bit shift instructions. + if (SDValue Shift = lowerVectorShuffleAsBitShift( + DL, MVT::v32i8, V1, V2, Mask, DAG)) + return Shift; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Blend; @@ -16815,7 +16951,7 @@ /// The mask is comming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using -/// "X86select" instead of "vselect". We just can't create the "vselect" node for +/// "X86select" instead of "vselect". We just can't create the "vselect" node for /// a scalar instruction. 
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, @@ -22590,7 +22726,7 @@ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vals[4]; SDLoc dl(InputVector); - + if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); @@ -22599,7 +22735,7 @@ SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(1, VecIdxTy)); - SDValue ShAmt = DAG.getConstant(32, + SDValue ShAmt = DAG.getConstant(32, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Index: test/CodeGen/X86/combine-or.ll =================================================================== --- test/CodeGen/X86/combine-or.ll +++ test/CodeGen/X86/combine-or.ll @@ -203,18 +203,16 @@ ; Verify that the dag-combiner does not fold a OR of two shuffles into a single ; shuffle instruction when the shuffle indexes are not compatible. -define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test17: -; CHECK: # BB#0: -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0] -; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2] -; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; CHECK-NEXT: orps %xmm1, %xmm2 -; CHECK-NEXT: movaps %xmm2, %xmm0 -; CHECK-NEXT: retq - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> +define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test17: +; CHECK: # BB#0: +; CHECK-NEXT: psllq $32, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0] +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } Index: test/CodeGen/X86/vector-idiv.ll =================================================================== --- test/CodeGen/X86/vector-idiv.ll +++ test/CodeGen/X86/vector-idiv.ll @@ -106,17 +106,17 @@ ; SSE-NEXT: psrld $2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test2: -; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpsrld $1, %ymm0, %ymm0 +; AVX-LABEL: test2: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $1, %ymm0, %ymm0 ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpsrld $2, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -956,17 +956,17 @@ ; SSE-NEXT: paddd %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test9: -; AVX: 
# BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 -; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vpsrld $31, %ymm0, %ymm1 +; AVX-LABEL: test9: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vpsrld $31, %ymm0, %ymm1 ; AVX-NEXT: vpsrad $2, %ymm0, %ymm0 ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -1047,17 +1047,17 @@ ; SSE-NEXT: psubd %xmm4, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test10: -; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2 -; AVX-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX-LABEL: test10: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX-NEXT: vpsrld $2, %ymm1, %ymm1 ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 @@ -1156,17 +1156,17 @@ ; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test11: -; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 -; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; AVX-NEXT: vpsrld $31, %ymm1, %ymm2 +; AVX-LABEL: test11: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; AVX-NEXT: vpsrld $31, %ymm1, %ymm2 ; AVX-NEXT: vpsrad $2, %ymm1, %ymm1 ; AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 Index: test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- 
test/CodeGen/X86/vector-shuffle-128-v16.ll +++ test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1085,6 +1085,108 @@ %shuffle.i = shufflevector <16 x i8> , <16 x i8> %weird_zero, <16 x i32> %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32> store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16 - store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16 - ret void -} + store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16 + ret void +} + +; +; Shuffle to logical bit shifts +; + +define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: +; SSE: # BB#0: +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: +; AVX: # BB#0: +; AVX-NEXT: vpsllw $8, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: +; SSE: # BB#0: +; SSE-NEXT: pslld $24, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: +; AVX: # BB#0: +; AVX-NEXT: vpslld $24, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08: +; SSE: # BB#0: +; SSE-NEXT: psllq $56, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $56, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14: +; SSE: # BB#0: +; SSE-NEXT: psllq $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $8, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz: +; SSE: # BB#0: +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrlw $8, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz: +; SSE: # BB#0: +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrld $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> 
@shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz: +; SSE: # BB#0: +; SSE-NEXT: psrlq $56, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $56, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +} Index: test/CodeGen/X86/vector-shuffle-128-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v4.ll +++ test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1356,6 +1356,38 @@ ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] ; AVX-NEXT: retq %a = load <4 x float>* %ptr - %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> - ret <4 x float> %shuffle -} + %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + ret <4 x float> %shuffle +} + +; +; Shuffle to logical bit shifts +; + +define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) { +; SSE-LABEL: shuffle_v4i32_z0zX: +; SSE: # BB#0: +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_z0zX: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $32, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) { +; SSE-LABEL: shuffle_v4i32_1z3z: +; SSE: # BB#0: +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_1z3z: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $32, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %shuffle +} Index: test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v8.ll +++ test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1914,6 +1914,121 @@ ; AVX: # BB#0: ; AVX-NEXT: vpmovzxwd %xmm0, %xmm0 ; AVX-NEXT: retq - %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> - ret <8 x i16> %shuffle -} + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +; +; Shuffle to logical bit shifts +; +define <8 x i16> @shuffle_v8i16_z0z2z4z6(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_z0z2z4z6: +; SSE: # BB#0: +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_z0z2z4z6: +; AVX: # BB#0: +; AVX-NEXT: vpslld $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zzz0zzz4(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_zzz0zzz4: +; SSE: # BB#0: +; SSE-NEXT: psllq $48, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zzz0zzz4: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $48, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zz01zX4X(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_zz01zX4X: +; SSE: # BB#0: +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zz01zX4X: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $32, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_z0X2z456(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_z0X2z456: 
+; SSE: # BB#0: +; SSE-NEXT: psllq $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_z0X2z456: +; AVX: # BB#0: +; AVX-NEXT: vpsllq $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_1z3zXz7z(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_1z3zXz7z: +; SSE: # BB#0: +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_1z3zXz7z: +; AVX: # BB#0: +; AVX-NEXT: vpsrld $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_1X3z567z(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_1X3z567z: +; SSE: # BB#0: +; SSE-NEXT: psrlq $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_1X3z567z: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $16, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_23zz67zz(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_23zz67zz: +; SSE: # BB#0: +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_23zz67zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $32, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_3zXXXzzz(<8 x i16> %a) { +; SSE-LABEL: shuffle_v8i16_3zXXXzzz: +; SSE: # BB#0: +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_3zXXXzzz: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $48, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle +} Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1359,6 +1359,110 @@ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq - %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> - ret <16 x i16> %shuffle -} + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> + ret <16 x i16> %shuffle +} + +; +; Shuffle to logical bit shifts +; + +define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) { +; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $16, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x 
i16> %a) { +; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllq $48, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) { +; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrld $16, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) { +; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,2,3,2,3,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlq $32, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> + ret <16 x i16> %shuffle +} Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1651,6 +1651,148 @@ ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq - %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> - ret <32 x i8> %shuffle -} + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %shuffle +} + +; +; Shuffle to logical bit shifts +; + +define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllw $8, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,0,1,128,128,4,5,128,128,8,9,128,128,12,13] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $16, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} 
+ +define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,0,1,128,128,128,128,128,128,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,0,0,0,0,0],zero,zero,xmm3[0,0,0,0,0,0],zero,zero +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllq $48, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlw $8, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,128,128,6,7,128,128,10,11,128,128,14,15,128,128] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0] +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrld $16, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> 
zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,128,128,128,15,128,128,128,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm3[0,0,0],zero,xmm3[0,0,0,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlq $56, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %shuffle +} Index: test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1846,6 +1846,42 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 ; AVX2-NEXT: retq - %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer - ret <8 x float> %1 -} + %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer + ret <8 x float> %1 +} + +; +; Shuffle to logical bit shifts +; + +define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) { +; AVX1-LABEL: shuffle_v8i32_z0U2zUz6: +; AVX1: # BB#0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_z0U2zUz6: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllq $32, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) { +; AVX1-LABEL: shuffle_v8i32_1U3z5zUU: +; AVX1: # BB#0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_1U3z5zUU: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlq $32, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> + ret <8 x i32> %shuffle +}
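
For reference, a minimal standalone C++ sketch (illustrative only, not LLVM code; the helper and the mask literal <4, 0, 6, undef> are assumptions reconstructed from the shuffle_v4i32_z0zX test name, since the literal masks are elided above) of the per-64-bit-lane check that lowerVectorShuffleAsBitShift performs when matching a v4i32 shuffle to PSLLQ $32: the low Shift elements of each v2i64 lane must be zeroable, and the remaining elements must form a sequential run taken from the source vector.

    // Standalone sketch of the PSLLQ $32 match on a v4i32 shuffle mask.
    // Indices >= 4 select the zeroinitializer operand, -1 is undef.
    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Mirrors isSequentialOrUndefInRange: Mask[Pos..Pos+Size) must be undef
    // (-1) or the sequence Low, Low+1, ...
    static bool isSequentialOrUndef(const std::vector<int> &Mask, int Pos,
                                    int Size, int Low) {
      for (int i = Pos, e = Pos + Size; i != e; ++i, ++Low)
        if (Mask[i] >= 0 && Mask[i] != Low)
          return false;
      return true;
    }

    int main() {
      // <4, 0, 6, -1>  ==  [ zz, a0, zz, undef ]  ==  PSLLQ $32 applied to %a.
      std::vector<int> Mask = {4, 0, 6, -1};
      const int Scale = 2; // v4i32 elements per v2i64 shift element
      const int Shift = 1; // shift by one 32-bit element == 32 bits

      bool IsLeftShift = true;
      for (int Lane = 0; Lane < (int)Mask.size(); Lane += Scale) {
        // The low 'Shift' elements of each 64-bit lane must be zeroable
        // (here approximated as undef or taken from the zero operand).
        for (int j = 0; j < Shift; ++j)
          IsLeftShift &= (Mask[Lane + j] < 0 || Mask[Lane + j] >= 4);
        // The rest must come sequentially from %a, starting at the lane base.
        IsLeftShift &=
            isSequentialOrUndef(Mask, Lane + Shift, Scale - Shift, Lane);
      }

      printf("lower as PSLLQ $32: %s\n", IsLeftShift ? "yes" : "no");
      assert(IsLeftShift);
      return 0;
    }

The right-shift (PSRL/VSRLI) case in the patch has the same structure, with the zeroable region moved to the high end of each lane and the sequential run offset by Shift.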