diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21290,6 +21290,76 @@
     }
   }
 
+  // See if we can replace a shuffle with an insert_subvector.
+  // e.g. v2i32 into v8i32:
+  // shuffle(lhs,concat(rhs0,rhs1,rhs2,rhs3),0,1,2,3,10,11,6,7).
+  // --> insert_subvector(lhs,rhs1,4).
+  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT) &&
+      TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT)) {
+    auto ShuffleToInsert = [&](SDValue LHS, SDValue RHS, ArrayRef<int> Mask) {
+      // Ensure RHS subvectors are legal.
+      assert(RHS.getOpcode() == ISD::CONCAT_VECTORS && "Can't find subvectors");
+      EVT SubVT = RHS.getOperand(0).getValueType();
+      int NumSubElts = SubVT.getVectorNumElements();
+      assert((NumElts % NumSubElts) == 0 && "Subvector mismatch");
+      if (!TLI.isTypeLegal(SubVT))
+        return SDValue();
+
+      // Search [NumSubElts] spans for RHS sequence.
+      for (int SubIdx = 0; SubIdx != (int)NumElts; SubIdx += NumSubElts) {
+        // Ensure LHS is used outside of the span and it's sequential (identity).
+        bool LHSIsIdentity = true;
+        for (int i = 0; i < SubIdx; ++i)
+          if (0 <= Mask[i] && Mask[i] != i)
+            LHSIsIdentity = false;
+        for (int i = SubIdx + NumSubElts; i < (int)NumElts; ++i)
+          if (0 <= Mask[i] && Mask[i] != i)
+            LHSIsIdentity = false;
+        if (!LHSIsIdentity)
+          continue;
+
+        // Look through span, ensure it only references RHS from one subvector.
+        int RHSStartIdx = -1;
+        for (int i = 0; i != NumSubElts; ++i) {
+          int Idx = Mask[SubIdx + i];
+          if (Idx < 0)
+            continue;
+          if (Idx < (int)NumElts)
+            break;
+
+          // Find RHS starting index - must be start of a subvector.
+          int RHSIdx = Idx - NumElts;
+          if (RHSStartIdx < 0) {
+            RHSStartIdx = RHSIdx - i;
+            if ((RHSStartIdx % NumSubElts) != 0)
+              break;
+          }
+
+          // Check we are in the same RHS sequence.
+          if (RHSIdx != (RHSStartIdx + i))
+            break;
+
+          // We've found an entire subvector - insert directly.
+          if (i == (NumSubElts - 1))
+            return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, LHS,
+                               RHS.getOperand(RHSStartIdx / NumSubElts),
+                               DAG.getVectorIdxConstant(SubIdx, SDLoc(N)));
+        }
+      }
+      return SDValue();
+    };
+    ArrayRef<int> Mask = SVN->getMask();
+    if (N1.getOpcode() == ISD::CONCAT_VECTORS)
+      if (SDValue InsertN1 = ShuffleToInsert(N0, N1, Mask))
+        return InsertN1;
+    if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
+      SmallVector<int> CommuteMask(Mask.begin(), Mask.end());
+      ShuffleVectorSDNode::commuteMask(CommuteMask);
+      if (SDValue InsertN0 = ShuffleToInsert(N1, N0, CommuteMask))
+        return InsertN0;
+    }
+  }
+
   // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
   // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
   if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7849,9 +7849,9 @@
 def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
           (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
 
-// A 64-bit subvector insert to the first 128-bit vector position
-// is a subregister copy that needs no instruction.
-multiclass InsertSubvectorUndef<ValueType Ty> {
+multiclass InsertSubvectorPatterns<ValueType Ty> {
+  // A 64-bit subvector insert to the first 128-bit vector position
+  // is a subregister copy that needs no instruction.
   def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (Ty 0)),
             (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
   def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (Ty 0)),
@@ -7868,10 +7868,44 @@
             (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
   def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-}
-
-defm : InsertSubvectorUndef<i32>;
-defm : InsertSubvectorUndef<i64>;
+  // Insert 64-bit subvector into lower half of 128-bit vector.
+  def : Pat<(insert_subvector (v2i64 FPR128:$vec), (v1i64 FPR64:$sub), (Ty 0)),
+            (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 1, V128:$vec, 0)>;
+  def : Pat<(insert_subvector (v2f64 FPR128:$vec), (v1f64 FPR64:$sub), (Ty 0)),
+            (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 1, V128:$vec, 0)>;
+  def : Pat<(insert_subvector (v4i32 FPR128:$vec), (v2i32 FPR64:$sub), (Ty 0)),
+            (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 1, V128:$vec, 0)>;
+  def : Pat<(insert_subvector (v4f32 FPR128:$vec), (v2f32 FPR64:$sub), (Ty 0)),
+            (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 1, V128:$vec, 0)>;
+  def : Pat<(insert_subvector (v8i16 FPR128:$vec), (v4i16 FPR64:$sub), (Ty 0)),
+            (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 1, V128:$vec, 0)>;
+  def : Pat<(insert_subvector (v8f16 FPR128:$vec), (v4f16 FPR64:$sub), (Ty 0)),
+            (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 1, V128:$vec, 0)>;
+  def : Pat<(insert_subvector (v8bf16 FPR128:$vec), (v4bf16 FPR64:$sub), (Ty 0)),
+            (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 1, V128:$vec, 0)>;
+  def : Pat<(insert_subvector (v16i8 FPR128:$vec), (v8i8 FPR64:$sub), (Ty 0)),
+            (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 1, V128:$vec, 0)>;
+  // Insert 64-bit subvector into upper half of 128-bit vector.
+  def : Pat<(insert_subvector (v2i64 FPR128:$vec), (v1i64 FPR64:$sub), (Ty 1)),
+            (INSvi64lane V128:$vec, 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 0)>;
+  def : Pat<(insert_subvector (v2f64 FPR128:$vec), (v1f64 FPR64:$sub), (Ty 1)),
+            (INSvi64lane V128:$vec, 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 0)>;
+  def : Pat<(insert_subvector (v4i32 FPR128:$vec), (v2i32 FPR64:$sub), (Ty 2)),
+            (INSvi64lane V128:$vec, 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 0)>;
+  def : Pat<(insert_subvector (v4f32 FPR128:$vec), (v2f32 FPR64:$sub), (Ty 2)),
+            (INSvi64lane V128:$vec, 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 0)>;
+  def : Pat<(insert_subvector (v8i16 FPR128:$vec), (v4i16 FPR64:$sub), (Ty 4)),
+            (INSvi64lane V128:$vec, 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 0)>;
+  def : Pat<(insert_subvector (v8f16 FPR128:$vec), (v4f16 FPR64:$sub), (Ty 4)),
+            (INSvi64lane V128:$vec, 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 0)>;
+  def : Pat<(insert_subvector (v8bf16 FPR128:$vec), (v4bf16 FPR64:$sub), (Ty 4)),
+            (INSvi64lane V128:$vec, 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 0)>;
+  def : Pat<(insert_subvector (v16i8 FPR128:$vec), (v8i8 FPR64:$sub), (Ty 8)),
+            (INSvi64lane V128:$vec, 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$sub, dsub), 0)>;
+}
+
+defm : InsertSubvectorPatterns<i32>;
+defm : InsertSubvectorPatterns<i64>;
 
 // Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
 // or v2f32.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6214,14 +6214,21 @@
 
   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
     assert(IdxVal != 0 && "Unexpected index");
-    NumElems = WideOpVT.getVectorNumElements();
-    unsigned ShiftLeft = NumElems - SubVecNumElems;
-    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
-    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
-                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
-    if (ShiftRight != 0)
-      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
-                           DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+    // If upper elements of Vec are known undef, then just shift into place.
+    if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
+                     [](SDValue V) { return V.isUndef(); })) {
+      SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+                           DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+    } else {
+      NumElems = WideOpVT.getVectorNumElements();
+      unsigned ShiftLeft = NumElems - SubVecNumElems;
+      unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+      SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+                           DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+      if (ShiftRight != 0)
+        SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+                             DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+    }
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
   }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1794,7 +1794,7 @@
 ; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <2 x i64> %x, i32 0
diff --git a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
--- a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -14,8 +14,8 @@
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
 ; CHECK-NEXT:    vhaddps %ymm4, %ymm0, %ymm0
 ; CHECK-NEXT:    vsubps %ymm0, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -87,7 +87,7 @@
 ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT:    vblendps $240, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x0c,0xc0,0xf0]
+; CHECK-NEXT:    vblendps $15, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x0f]
 ; CHECK-NEXT:    # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -695,11 +695,9 @@
 ; ALL-LABEL: PR50053:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vmovaps (%rsi), %ymm0
-; ALL-NEXT:    vmovaps 32(%rsi), %xmm1
-; ALL-NEXT:    vmovaps 48(%rsi), %xmm2
-; ALL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1]
+; ALL-NEXT:    vinsertf128 $1, 32(%rsi), %ymm0, %ymm1
+; ALL-NEXT:    vinsertf128 $0, 48(%rsi), %ymm0, %ymm0
 ; ALL-NEXT:    vmovaps %ymm1, (%rdi)
-; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; ALL-NEXT:    vmovaps %ymm0, 32(%rdi)
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -14,35 +14,35 @@
 ; CHECK-NEXT:    vmovaps %ymm4, %ymm10
 ; CHECK-NEXT:    vmovaps %ymm3, %ymm9
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm8
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3
+; CHECK-NEXT:    vmovaps %ymm0, %ymm4
 ; CHECK-NEXT:    vmovaps 240(%rbp), %ymm1
-; CHECK-NEXT:    vmovaps 208(%rbp), %ymm4
+; CHECK-NEXT:    vmovaps 208(%rbp), %ymm3
 ; CHECK-NEXT:    vmovaps 176(%rbp), %ymm0
 ; CHECK-NEXT:    vmovaps 144(%rbp), %ymm0
 ; CHECK-NEXT:    vmovaps 112(%rbp), %ymm11
 ; CHECK-NEXT:    vmovaps 80(%rbp), %ymm11
 ; CHECK-NEXT:    vmovaps 48(%rbp), %ymm11
 ; CHECK-NEXT:    vmovaps 16(%rbp), %ymm11
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT:    vmovaps %xmm4, %xmm6
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT:    vmovaps %xmm3, %xmm6
 ; CHECK-NEXT:    # implicit-def: $ymm2
 ; CHECK-NEXT:    vinserti128 $1, %xmm6, %ymm2, %ymm2
-; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
 ; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; CHECK-NEXT:    vextracti128 $1, %ymm7, %xmm2
 ; CHECK-NEXT:    vmovq {{.*#+}} xmm6 = xmm2[0],zero
 ; CHECK-NEXT:    # implicit-def: $ymm2
 ; CHECK-NEXT:    vmovaps %xmm6, %xmm2
-; CHECK-NEXT:    # kill: def $xmm3 killed $xmm3 killed $ymm3
-; CHECK-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; CHECK-NEXT:    vmovaps %xmm7, %xmm3
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
-; CHECK-NEXT:    # implicit-def: $ymm3
-; CHECK-NEXT:    vmovaps %xmm6, %xmm3
-; CHECK-NEXT:    vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; CHECK-NEXT:    # kill: def $xmm4 killed $xmm4 killed $ymm4
+; CHECK-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; CHECK-NEXT:    vmovaps %xmm7, %xmm4
+; CHECK-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    # implicit-def: $ymm4
+; CHECK-NEXT:    vmovaps %xmm6, %xmm4
+; CHECK-NEXT:    vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; CHECK-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]
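Illustration only (not part of the patch or its test files): a minimal IR sketch of the kind of shuffle the new DAGCombiner fold targets. The function name is hypothetical, and the sketch assumes SelectionDAG builds the widening shufflevector as CONCAT_VECTORS(%rhs, undef), so the outer shuffle reaches visitVECTOR_SHUFFLE as shuffle(lhs, concat(rhs, undef)) with mask 0,1,2,3,8,9,10,11 and can fold to insert_subvector(lhs, rhs, 4). The final instruction selection (e.g. vinsertf128 on AVX, INS on AArch64 for 128-bit cases) depends on the target and on which other combines fire first.

; Hypothetical example: insert %rhs into the upper half of %lhs via a
; two-operand shuffle whose second operand is a widened (concat-style) vector.
define <8 x float> @shuffle_concat_to_insert(<8 x float> %lhs, <4 x float> %rhs) {
  %widened = shufflevector <4 x float> %rhs, <4 x float> undef,
                           <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                      i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %lhs, <8 x float> %widened,
                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                  i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}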