Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12202,12 +12202,89 @@
                      DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Ops));
 }
 
-SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
-  // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of
-  // EXTRACT_SUBVECTOR operations. If so, and if the EXTRACT_SUBVECTOR vector
-  // inputs come from at most two distinct vectors, turn this into a shuffle
-  // node.
+// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
+// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
+// most two distinct vectors the same size as the result, attempt to turn this
+// into a legal shuffle.
+//
+// Operands are looked through any BITCAST, and UNDEF operands (or extracts
+// from an UNDEF vector) become undef (-1) shuffle mask elements. Returns an
+// empty SDValue if the pattern does not match or the target reports the
+// resulting shuffle mask as not legal.
+static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  EVT OpVT = N->getOperand(0).getValueType();
+  int NumElts = VT.getVectorNumElements();
+  int NumOpElts = OpVT.getVectorNumElements();
+
+  SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
+  SmallVector<int, 8> Mask;
+
+  for (SDValue Op : N->ops()) {
+    // Peek through any bitcast.
+    while (Op.getOpcode() == ISD::BITCAST)
+      Op = Op.getOperand(0);
+
+    // UNDEF nodes convert to UNDEF shuffle mask values.
+    if (Op.getOpcode() == ISD::UNDEF) {
+      Mask.append((unsigned)NumOpElts, -1);
+      continue;
+    }
+
+    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+      return SDValue();
+
+    // What vector are we extracting the subvector from and at what index?
+    SDValue ExtVec = Op.getOperand(0);
+    if (ExtVec.getOpcode() == ISD::UNDEF) {
+      Mask.append((unsigned)NumOpElts, -1);
+      continue;
+    }
+
+    EVT ExtVT = ExtVec.getValueType();
+    // Only a constant subvector index can be turned into a shuffle mask.
+    if (!isa<ConstantSDNode>(Op.getOperand(1)))
+      return SDValue();
+    int ExtIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    // Ensure that we are extracting a subvector from a vector the same
+    // size as the result.
+    if (ExtVT.getSizeInBits() != VT.getSizeInBits())
+      return SDValue();
+
+    // Scale the subvector index to account for any bitcast.
+    int NumExtElts = ExtVT.getVectorNumElements();
+    if (0 == (NumExtElts % NumElts))
+      ExtIdx /= (NumExtElts / NumElts);
+    else if (0 == (NumElts % NumExtElts))
+      ExtIdx *= (NumElts / NumExtElts);
+    else
+      return SDValue();
+
+    // At most we can reference 2 inputs in the final shuffle.
+    if (SV0.getOpcode() == ISD::UNDEF || SV0 == ExtVec) {
+      SV0 = ExtVec;
+      for (int i = 0; i != NumOpElts; ++i)
+        Mask.push_back(i + ExtIdx);
+    } else if (SV1.getOpcode() == ISD::UNDEF || SV1 == ExtVec) {
+      SV1 = ExtVec;
+      // Mask indices for the second shuffle input start at NumElts.
+      for (int i = 0; i != NumOpElts; ++i)
+        Mask.push_back(i + ExtIdx + NumElts);
+    } else {
+      return SDValue();
+    }
+  }
+
+  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT))
+    return SDValue();
+
+  return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
+                              DAG.getBitcast(VT, SV1), Mask);
+}
+
+SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   // If we only have one input vector, we don't need to do any concatenation.
   if (N->getNumOperands() == 1)
     return N->getOperand(0);
 
@@ -12308,6 +12385,11 @@
   if (SDValue V = combineConcatVectorOfScalars(N, DAG))
     return V;
 
+  // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
+  if (Level < AfterLegalizeVectorOps)
+    if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
+      return V;
+
   // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
   // nodes often generate nop CONCAT_VECTOR nodes.
   // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
Index: test/CodeGen/X86/pr15267.ll
===================================================================
--- test/CodeGen/X86/pr15267.ll
+++ test/CodeGen/X86/pr15267.ll
@@ -65,11 +65,10 @@
 ; CHECK-NEXT:    shlq $60, %rax
 ; CHECK-NEXT:    sarq $63, %rax
 ; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
-; CHECK-NEXT:    vmovd %ecx, %xmm1
-; CHECK-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; CHECK-NEXT:    vpmovsxdq %xmm1, %xmm1
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %wide.load35 = load <4 x i1>, <4 x i1>* %in, align 1
   %sext = sext <4 x i1> %wide.load35 to <4 x i64>
Index: test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3262,11 +3262,15 @@
 }
 
 define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i16> %a, <16 x i16> %b) {
-; ALL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
-; ALL:       # BB#0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    retq
+; AVX1-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
   %alo = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuf = shufflevector <8 x i16> %alo, <8 x i16> %bhi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -3276,9 +3280,7 @@
 define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) {
 ; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; ALL-NEXT:    retq
   %ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -974,11 +974,15 @@
 }
 
 define <4 x i64> @concat_v4i64_0167(<4 x i64> %a0, <4 x i64> %a1) {
-; ALL-LABEL: concat_v4i64_0167:
-; ALL:       # BB#0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    retq
+; AVX1-LABEL: concat_v4i64_0167:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: concat_v4i64_0167:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
   %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
   %a1hi = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 6, i32 7>
   %shuffle64 = shufflevector <2 x i64> %a0lo, <2 x i64> %a1hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2099,11 +2099,15 @@
 }
 
 define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: concat_v8i32_0123CDEF:
-; ALL:       # BB#0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    retq
+; AVX1-LABEL: concat_v8i32_0123CDEF:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: concat_v8i32_0123CDEF:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
   %alo = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %bhi = shufflevector <8 x i32> %b, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuf = shufflevector <4 x i32> %alo, <4 x i32> %bhi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -2113,9 +2117,7 @@
 define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) {
 ; ALL-LABEL: concat_v8i32_4567CDEF_bc:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; ALL-NEXT:    retq
   %a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>