Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -22490,6 +22490,7 @@
 }
 
 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512-bit vectors as well.
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget* Subtarget) {
@@ -22499,6 +22500,7 @@
   SDValue V2 = SVOp->getOperand(1);
   MVT VT = SVOp->getSimpleValueType(0);
   unsigned NumElems = VT.getVectorNumElements();
+  unsigned HalfNumElems = NumElems / 2;
 
   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
       V2.getOpcode() == ISD::CONCAT_VECTORS) {
@@ -22523,9 +22525,9 @@
     // To match the shuffle mask, the first half of the mask should
     // be exactly the first vector, and all the rest a splat with the
     // first element of the second one.
-    for (unsigned i = 0; i != NumElems/2; ++i)
+    for (unsigned i = 0; i != HalfNumElems; ++i)
       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
-          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+          !isUndefOrEqual(SVOp->getMaskElt(i + HalfNumElems), NumElems))
         return SDValue();
 
     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
@@ -22569,7 +22571,7 @@
   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+    SDValue V = Extract128BitVector(V1, HalfNumElems, DAG, dl);
     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
     return DCI.CombineTo(N, InsV);
   }
 
@@ -22577,10 +22579,40 @@
   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   if (isShuffleLow128VectorInsertHigh(SVOp)) {
     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, HalfNumElems, DAG, dl);
     return DCI.CombineTo(N, InsV);
   }
 
+  // vector_shuffle <x, x, x, x, u, u, u, u> or <x, x, u, u>
+  if (isUndefInRange(SVOp->getMask(), HalfNumElems, HalfNumElems)) {
+    // If the shuffle only uses the lower halves of the inputs,
+    // then extract them and perform the 'half' shuffle.
+    bool AllLowerHalf = true;
+    SmallVector<int, 8> HalfMask;
+    for (unsigned i = 0; i != HalfNumElems; ++i) {
+      int M = SVOp->getMaskElt(i);
+      if (M < 0) {
+        HalfMask.push_back(M);
+        continue;
+      }
+      AllLowerHalf &= (M % NumElems) < HalfNumElems;
+      if (M >= (int)NumElems) {
+        HalfMask.push_back((M % NumElems) + HalfNumElems);
+        continue;
+      }
+      HalfMask.push_back(M);
+    }
+
+    if (AllLowerHalf) {
+      MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);
+      SDValue Half1 = Extract128BitVector(V1, 0, DAG, dl);
+      SDValue Half2 = Extract128BitVector(V2, 0, DAG, dl);
+      SDValue V = DAG.getVectorShuffle(HalfVT, dl, Half1, Half2, HalfMask);
+      SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
+      return DCI.CombineTo(N, InsV);
+    }
+  }
+
   return SDValue();
 }
 
@@ -26349,7 +26381,7 @@
 
   // If we're negating a FMUL node on a target with FMA, then we can avoid the
   // use of a constant by performing (-0 - A*B) instead.
-  // FIXME: Check rounding control flags as well once it becomes available. 
+  // FIXME: Check rounding control flags as well once it becomes available.
   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
       Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
Index: test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3284,6 +3284,15 @@
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
 ; AVX1:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2013,6 +2013,15 @@
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX1:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -491,7 +491,7 @@
 define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_11uu:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; ALL-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x double> %shuffle
@@ -1135,20 +1135,10 @@
 }
 
 define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_11uu:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4i64_11uu:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_11uu:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v4i64_11uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x i64> %shuffle
 }
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -864,6 +864,15 @@
   ret <8 x float> %shuffle
 }
 
+define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1111uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
 define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_5555uuuu:
 ; AVX1:       # BB#0:
@@ -1961,6 +1970,15 @@
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2222uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_44444444:
 ; AVX1:       # BB#0:
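
Note for reviewers: below is a minimal standalone sketch of the HalfMask remapping that the new lower-half combine in the X86ISelLowering.cpp hunk performs. It is illustrative only, not LLVM code: plain C++ with no LLVM dependencies, a made-up v8i32 width, and a made-up example mask, using -1 for undef lanes as SelectionDAG shuffle masks do. Undef lanes pass through, lanes referencing V2 are rebased onto the extracted lower half of V2 (operand 1 of the narrower shuffle), and the combine only fires when every referenced lane lies in the lower 128-bit half of its source.

#include <cstdio>
#include <vector>

int main() {
  const int NumElems = 8;                // e.g. v8i32 (made-up example width)
  const int HalfNumElems = NumElems / 2;

  // Example mask <1, 9, u, u, u, u, u, u>: lane 1 of V1, lane 1 of V2,
  // upper half entirely undef.
  std::vector<int> Mask = {1, 9, -1, -1, -1, -1, -1, -1};

  bool AllLowerHalf = true;
  std::vector<int> HalfMask;
  for (int i = 0; i != HalfNumElems; ++i) {
    int M = Mask[i];
    if (M < 0) {
      HalfMask.push_back(M);             // undef lanes stay undef
      continue;
    }
    // The combine only applies if every referenced lane sits in the
    // lower 128-bit half of its source vector.
    AllLowerHalf &= (M % NumElems) < HalfNumElems;
    if (M >= NumElems) {
      // V2 lanes: rebase to index the extracted half of V2, which
      // becomes operand 1 of the narrower shuffle.
      HalfMask.push_back((M % NumElems) + HalfNumElems);
      continue;
    }
    HalfMask.push_back(M);               // V1 lanes keep their index
  }

  std::printf("AllLowerHalf = %d, HalfMask =", (int)AllLowerHalf);
  for (int M : HalfMask)
    std::printf(" %d", M);
  std::printf("\n");
  return 0;
}

Running it prints "AllLowerHalf = 1, HalfMask = 1 5 -1 -1": the v8i32 shuffle <1, 9, u, u, u, u, u, u> becomes a v4i32 shuffle <1, 5, u, u> of the two extracted 128-bit halves, which is the same narrowing the updated tests above check for (ymm shuffles collapsing to single xmm instructions).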