Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17599,6 +17599,53 @@
   return SDValue();
 }
 
+/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
+/// followed by concatenation. Narrow vector ops may have better performance
+/// than wide ops, and this can unlock further narrowing of other vector ops.
+/// Targets can invert this transform later if it is not profitable.
+static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
+                                         SelectionDAG &DAG) {
+  SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
+  if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
+      N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
+      !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
+    return SDValue();
+
+  // Split the wide shuffle mask into halves. Any mask element that is accessing
+  // operand 1 is offset down to account for narrowing of the vectors.
+  ArrayRef<int> Mask = Shuf->getMask();
+  EVT VT = Shuf->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfNumElts = NumElts / 2;
+  SmallVector<int, 16> Mask0(HalfNumElts, -1);
+  SmallVector<int, 16> Mask1(HalfNumElts, -1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (Mask[i] == -1)
+      continue;
+    int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
+    if (i < HalfNumElts)
+      Mask0[i] = M;
+    else
+      Mask1[i - HalfNumElts] = M;
+  }
+
+  // Ask the target if this is a valid transform.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+                                HalfNumElts);
+  if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
+      !TLI.isShuffleMaskLegal(Mask1, HalfVT))
+    return SDValue();
+
+  // shuffle (concat X, undef), (concat Y, undef), Mask -->
+  // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
+  SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
+  SDLoc DL(Shuf);
+  SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
+  SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
+}
+
 // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
 // or turn a shuffle of a single concat into simpler shuffle then concat.
 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
@@ -18380,6 +18427,9 @@
     return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
   }
 
+  if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
+    return V;
+
   return SDValue();
 }
 
Index: llvm/test/CodeGen/ARM/vuzp.ll
===================================================================
--- llvm/test/CodeGen/ARM/vuzp.ll
+++ llvm/test/CodeGen/ARM/vuzp.ll
@@ -270,11 +270,10 @@
 ; CHECK-LABEL: vuzp_lower_shufflemask_undef:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr d17, [r1]
-; CHECK-NEXT:    vldr d16, [r0]
-; CHECK-NEXT:    vorr q9, q8, q8
-; CHECK-NEXT:    vuzp.16 q8, q9
-; CHECK-NEXT:    vmov r0, r1, d18
-; CHECK-NEXT:    vmov r2, r3, d19
+; CHECK-NEXT:    vldr d18, [r0]
+; CHECK-NEXT:    vuzp.16 d18, d17
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
 entry:
   %tmp1 = load <4 x i16>, <4 x i16>* %A
@@ -286,13 +285,13 @@
 define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
 ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr d18, [r0]
+; CHECK-NEXT:    vorr d19, d18, d18
 ; CHECK-NEXT:    vldr d17, [r1]
-; CHECK-NEXT:    vldr d16, [r0]
-; CHECK-NEXT:    vdup.32 q9, d16[0]
-; CHECK-NEXT:    vuzp.32 q8, q9
-; CHECK-NEXT:    vext.32 q8, q9, q9, #2
-; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vtrn.32 d19, d17
+; CHECK-NEXT:    vdup.32 d16, d18[0]
 ; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 entry:
   %tmp1 = load <2 x i32>, <2 x i32>* %A
@@ -304,11 +303,10 @@
 define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
 ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr d17, [r1]
-; CHECK-NEXT:    vldr d16, [r0]
-; CHECK-NEXT:    vrev64.32 q9, q8
-; CHECK-NEXT:    vuzp.32 q8, q9
-; CHECK-NEXT:    vst1.64 {d18, d19}, [r2]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.32 d17, d16
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r2]
 ; CHECK-NEXT:    mov pc, lr
 entry:
   %tmp1 = load <2 x i32>, <2 x i32>* %A
Index: llvm/test/CodeGen/ARM/vzip.ll
===================================================================
--- llvm/test/CodeGen/ARM/vzip.ll
+++ llvm/test/CodeGen/ARM/vzip.ll
@@ -270,8 +270,8 @@
 ; CHECK-LABEL: vzip_lower_shufflemask_undef:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr d17, [r1]
-; CHECK-NEXT:    vldr d16, [r0]
-; CHECK-NEXT:    vzip.16 d16, d17
+; CHECK-NEXT:    vldr d18, [r0]
+; CHECK-NEXT:    vzip.16 d18, d17
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
Index: llvm/test/CodeGen/X86/mulvi32.ll
===================================================================
--- llvm/test/CodeGen/X86/mulvi32.ll
+++ llvm/test/CodeGen/X86/mulvi32.ll
@@ -229,8 +229,9 @@
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm0[1]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %even0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32>
   %even1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32>
Index: llvm/test/CodeGen/X86/oddshuffles.ll
===================================================================
--- llvm/test/CodeGen/X86/oddshuffles.ll
+++ llvm/test/CodeGen/X86/oddshuffles.ll
@@ -381,36 +381,23 @@
 ; SSE42-NEXT:    movdqa %xmm2, (%rdi)
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: v7i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
-; AVX1-NEXT:    vmovss %xmm1, 24(%rdi)
-; AVX1-NEXT:    vmovlps %xmm0, 16(%rdi)
-; AVX1-NEXT:    vmovaps %xmm2, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: v7i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vmovss %xmm1, 24(%rdi)
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovlps %xmm1, 16(%rdi)
-; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: v7i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-NEXT:    vmovss %xmm1, 24(%rdi)
+; AVX-NEXT:    vmovlps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm2, (%rdi)
+; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: v7i32:
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
 ; XOP-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
-; XOP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; XOP-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
+; XOP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; XOP-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; XOP-NEXT:    vmovss %xmm1, 24(%rdi)
 ; XOP-NEXT:    vmovlps %xmm0, 16(%rdi)
 ; XOP-NEXT:    vmovaps %xmm2, (%rdi)
@@ -487,12 +474,12 @@
 ; SSE2-NEXT:    pandn %xmm2, %xmm3
 ; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
-; SSE2-NEXT:    pandn %xmm0, %xmm2
-; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
 ; SSE2-NEXT:    movq %xmm2, 16(%rdi)
 ; SSE2-NEXT:    movdqa %xmm3, (%rdi)
 ; SSE2-NEXT:    retq
@@ -502,7 +489,7 @@
 ; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
 ; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
 ; SSE42-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4,5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
 ; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
 ; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
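
Illustration (not part of the patch): the new combine only applies when both shuffle operands are concat_vectors whose upper half is undef, so every live lane comes from the narrow low half of its operand. The wide mask is then split into two half-width masks, and any index that referred to operand 1 is shifted down by HalfNumElts so it selects the same lane of the narrow Y vector. Below is a minimal standalone C++ sketch of that index arithmetic, using plain std::vector instead of LLVM types; splitWideMask and the sample mask are illustrative, not LLVM APIs or test data.

#include <cstdio>
#include <vector>

// Sketch of the mask-splitting step in foldShuffleOfConcatUndefs:
// a wide shuffle of (concat X, undef) and (concat Y, undef) becomes two
// narrow shuffles of X and Y. -1 denotes an undef lane.
static void splitWideMask(const std::vector<int> &Mask,
                          std::vector<int> &Mask0, std::vector<int> &Mask1) {
  unsigned NumElts = Mask.size();
  unsigned HalfNumElts = NumElts / 2;
  Mask0.assign(HalfNumElts, -1);
  Mask1.assign(HalfNumElts, -1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (Mask[i] == -1)
      continue;
    // Operand-0 lanes keep their index (already < HalfNumElts because the
    // high half of that concat is undef); operand-1 lanes drop by HalfNumElts
    // so they index the Y operand of the new half-width shuffles.
    int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
    if (i < HalfNumElts)
      Mask0[i] = M;
    else
      Mask1[i - HalfNumElts] = M;
  }
}

int main() {
  // v8 interleave of (concat v4 X, undef) with (concat v4 Y, undef):
  // wide indices 0-3 are X lanes, 8-11 are Y lanes.
  std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
  std::vector<int> Mask0, Mask1;
  splitWideMask(Mask, Mask0, Mask1);
  // Prints "0 4 1 5 | 2 6 3 7": in the narrow shuffles, 0-3 select X lanes
  // and 4-7 select Y lanes, so concat(shuf0, shuf1) equals the wide shuffle.
  for (int M : Mask0)
    printf("%d ", M);
  printf("| ");
  for (int M : Mask1)
    printf("%d ", M);
  printf("\n");
  return 0;
}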