diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21299,10 +21299,11 @@
     }
   }
 
-  // See if we can replace a shuffle with an insert_subvector.
+  // See if we can replace a shuffle with an insert_subvector sequence.
   // e.g. v2i32 into v8i32:
-  // shuffle(lhs,concat(rhs0,rhs1,rhs2,rhs3),0,1,2,3,10,11,6,7).
-  // --> insert_subvector(lhs,rhs1,4).
+  // shuffle(lhs,concat(rhs0,rhs1,rhs2,rhs3),12,13,2,3,10,11,6,7).
+  // -->
+  // insert_subvector(insert_subvector(lhs, rhs2, 0), rhs1, 4).
   if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT) &&
       TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT)) {
     auto ShuffleToInsert = [&](SDValue LHS, SDValue RHS, ArrayRef<int> Mask) {
@@ -21315,41 +21316,56 @@
       if (!TLI.isTypeLegal(SubVT))
        return SDValue();
 
-      // Don't bother if we have an unary shuffle (matches undef + LHS elts).
-      if (all_of(Mask, [NumElts](int M) { return M < (int)NumElts; }))
-        return SDValue();
+      SmallVector<std::pair<SDValue, int>, 4> InsertSubvecSequence;
 
       // Search [NumSubElts] spans for RHS sequence.
       // TODO: Can we avoid nested loops to increase performance?
-      SmallVector<int> InsertionMask(NumElts);
       for (int SubVec = 0; SubVec != NumSubVecs; ++SubVec) {
         for (int SubIdx = 0; SubIdx != (int)NumElts; SubIdx += NumSubElts) {
-          // Reset mask to identity.
-          std::iota(InsertionMask.begin(), InsertionMask.end(), 0);
+          ArrayRef<int> ActualSubmask =
+              Mask.drop_front(SubIdx).take_front(NumSubElts);
+          auto IdentitySubmask = seq(SubIdx, SubIdx + NumSubElts);
+          auto ExpectedSubmask =
+              seq(NumElts + (SubVec * NumSubElts),
+                  NumElts + (SubVec * NumSubElts) + NumSubElts);
+          assert(ActualSubmask.size() == IdentitySubmask.size() &&
+                 ActualSubmask.size() == ExpectedSubmask.size() &&
+                 ActualSubmask.size() == (unsigned)NumSubElts &&
+                 "Miscalculated mask subsamples?");
 
-          // Add subvector insertion.
-          std::iota(InsertionMask.begin() + SubIdx,
-                    InsertionMask.begin() + SubIdx + NumSubElts,
-                    NumElts + (SubVec * NumSubElts));
+          // If all of the actual shuffle mask elements in this subsection
+          // are undef/identity then skip this subsection - keep LHS elements.
+          if (all_of(zip(ActualSubmask, IdentitySubmask), [](auto I) {
+                int ActualIdx, IdentityIdx;
+                std::tie(ActualIdx, IdentityIdx) = I;
+                return ActualIdx < 0 || ActualIdx == IdentityIdx;
+              }))
+            continue;
 
-          // See if the shuffle mask matches the reference insertion mask.
-          bool MatchingShuffle = true;
-          for (int i = 0; i != (int)NumElts; ++i) {
-            int ExpectIdx = InsertionMask[i];
-            int ActualIdx = Mask[i];
-            if (0 <= ActualIdx && ExpectIdx != ActualIdx) {
-              MatchingShuffle = false;
-              break;
-            }
-          }
+          // Does the shuffle insert subvector \p SubVec at position \p SubIdx?
+          // If not, then the shuffle will stay, and we have to abort.
+          if (!all_of(zip(ActualSubmask, ExpectedSubmask), [](auto I) {
+                int ActualIdx, ExpectIdx;
+                std::tie(ActualIdx, ExpectIdx) = I;
+                return ActualIdx < 0 || ActualIdx == ExpectIdx;
+              }))
+            return SDValue();
 
-          if (MatchingShuffle)
-            return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, LHS,
-                               RHS.getOperand(SubVec),
-                               DAG.getVectorIdxConstant(SubIdx, SDLoc(N)));
+          InsertSubvecSequence.emplace_back(RHS.getOperand(SubVec), SubIdx);
         }
       }
-      return SDValue();
+      assert(
+          !InsertSubvecSequence.empty() &&
+          "Did not discover a sequence of insertions yet didn't early-return?");
+
+      SDLoc DL(N);
+      SDValue Res = LHS;
+      for (std::pair<SDValue, int> Step :
+           InsertSubvecSequence)
+        Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Res, Step.first,
+                          DAG.getVectorIdxConstant(Step.second, DL));
+
+      return Res;
     };
     ArrayRef<int> Mask = SVN->getMask();
     if (N1.getOpcode() == ISD::CONCAT_VECTORS)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1794,7 +1794,7 @@
 ; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
   %vecext = extractelement <2 x i64> %x, i32 0
diff --git a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
--- a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -14,8 +14,8 @@
 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
 ; CHECK-NEXT: vhaddps %ymm4, %ymm0, %ymm0
 ; CHECK-NEXT: vsubps %ymm0, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -87,7 +87,7 @@
 ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT: vblendps $15, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x0f]
+; CHECK-NEXT: vblendps $240, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x0c,0xc0,0xf0]
 ; CHECK-NEXT: # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -695,9 +695,11 @@
 ; ALL-LABEL: PR50053:
 ; ALL: # %bb.0:
 ; ALL-NEXT: vmovaps (%rsi), %ymm0
-; ALL-NEXT: vinsertf128 $1, 32(%rsi), %ymm0, %ymm1
-; ALL-NEXT: vinsertf128 $0, 48(%rsi), %ymm0, %ymm0
+; ALL-NEXT: vmovaps 32(%rsi), %xmm1
+; ALL-NEXT: vmovaps 48(%rsi), %xmm2
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1]
 ; ALL-NEXT: vmovaps %ymm1, (%rdi)
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; ALL-NEXT: vmovaps %ymm0, 32(%rdi)
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -14,35 +14,35 @@
 ; CHECK-NEXT: vmovaps %ymm4, %ymm10
 ; CHECK-NEXT: vmovaps %ymm3, %ymm9
 ; CHECK-NEXT: vmovaps %ymm1, %ymm8
-; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
 ; CHECK-NEXT: vmovaps 240(%rbp), %ymm1
-; CHECK-NEXT: vmovaps 208(%rbp), %ymm3
+; CHECK-NEXT: vmovaps 208(%rbp), %ymm4
 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm0
 ; CHECK-NEXT: vmovaps 144(%rbp), %ymm0
 ; CHECK-NEXT: vmovaps 112(%rbp), %ymm11
 ; CHECK-NEXT: vmovaps 80(%rbp), %ymm11
 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm11
 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm11
-; CHECK-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vmovaps %xmm3, %xmm6
+; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vmovaps %xmm4, %xmm6
 ; CHECK-NEXT: # implicit-def: $ymm2
 ; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
-; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
 ; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm2[0],zero
 ; CHECK-NEXT: # implicit-def: $ymm2
 ; CHECK-NEXT: vmovaps %xmm6, %xmm2
-; CHECK-NEXT: # kill: def $xmm4 killed $xmm4 killed $ymm4
-; CHECK-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; CHECK-NEXT: vmovaps %xmm7, %xmm4
-; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: # implicit-def: $ymm4
-; CHECK-NEXT: vmovaps %xmm6, %xmm4
-; CHECK-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; CHECK-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; CHECK-NEXT: # kill: def $xmm3 killed $xmm3 killed $ymm3
+; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; CHECK-NEXT: vmovaps %xmm7, %xmm3
+; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: # implicit-def: $ymm3
+; CHECK-NEXT: vmovaps %xmm6, %xmm3
+; CHECK-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
 ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
 ; CHECK-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -563,7 +563,9 @@
 define <16 x float> @insert_sub1_12(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) {
 ; ALL-LABEL: insert_sub1_12:
 ; ALL: # %bb.0:
-; ALL-NEXT: vinsertf32x4 $3, %xmm2, %zmm0, %zmm0
+; ALL-NEXT: vinsertf32x4 $1, %xmm2, %zmm0, %zmm1
+; ALL-NEXT: vmovapd {{.*#+}} zmm2 = [0,1,2,3,4,5,10,11]
+; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
 ; ALL-NEXT: retq
   %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32>
   %sub34 = shufflevector <4 x float> %sub3, <4 x float> %sub4, <8 x i32>
@@ -589,8 +591,8 @@
 define <16 x float> @insert_sub01_8(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) {
 ; ALL-LABEL: insert_sub01_8:
 ; ALL: # %bb.0:
-; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; ALL-NEXT: vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; ALL-NEXT: retq
   %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32>
@@ -605,7 +607,8 @@
 ; ALL: # %bb.0:
 ; ALL-NEXT: # kill: def $xmm3 killed $xmm3 def $ymm3
 ; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm1
-; ALL-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm1
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7]
 ; ALL-NEXT: retq
   %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32>
   %sub34 = shufflevector <4 x float> %sub3, <4 x float> %sub4, <8 x i32>
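
As an illustration of the pattern the updated combine targets, here is a small LLVM IR sketch (not part of the patch; the function name and the way the concat operand is built are hypothetical) mirroring the v2i32-into-v8i32 example from the DAGCombiner comment: lanes 0-1 of the result take rhs2, lanes 4-5 take rhs1, and the remaining lanes keep lhs, i.e. insert_subvector(insert_subvector(lhs, rhs2, 0), rhs1, 4).

; Sketch only - whether the concat-building shuffles below reach the combine as a
; single CONCAT_VECTORS node depends on the target and on earlier DAG combines.
define <8 x i32> @shuffle_as_insert_chain(<8 x i32> %lhs, <2 x i32> %rhs0, <2 x i32> %rhs1, <2 x i32> %rhs2, <2 x i32> %rhs3) {
  ; Build concat(rhs0,rhs1,rhs2,rhs3) as a single <8 x i32> value.
  %lo = shufflevector <2 x i32> %rhs0, <2 x i32> %rhs1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <2 x i32> %rhs2, <2 x i32> %rhs3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Mask 12,13,2,3,10,11,6,7: elements 12-13 select rhs2 into lanes 0-1,
  ; 10-11 select rhs1 into lanes 4-5, and 2-3 / 6-7 are identity lanes from %lhs.
  %res = shufflevector <8 x i32> %lhs, <8 x i32> %concat, <8 x i32> <i32 12, i32 13, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7>
  ret <8 x i32> %res
}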