diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8992,14 +8992,18 @@
   bool isConstant = true;
   bool AllLanesExtractElt = true;
   unsigned NumConstantLanes = 0;
+  unsigned NumDifferentLanes = 0;
+  unsigned NumUndefLanes = 0;
   SDValue Value;
   SDValue ConstantValue;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       AllLanesExtractElt = false;
-    if (V.isUndef())
+    if (V.isUndef()) {
+      ++NumUndefLanes;
       continue;
+    }
     if (i > 0)
       isOnlyLowElement = false;
     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
@@ -9015,8 +9019,10 @@

     if (!Value.getNode())
       Value = V;
-    else if (V != Value)
+    else if (V != Value) {
       usesOnlyOneValue = false;
+      ++NumDifferentLanes;
+    }
   }

   if (!Value.getNode()) {
@@ -9142,11 +9148,20 @@
     }
   }

+  // If we need to insert a small number of different non-constant elements and
+  // the vector width is sufficiently large, prefer using DUP with the common
+  // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
+  // skip the constant lane handling below.
+  bool PreferDUPAndInsert =
+      !isConstant && NumDifferentLanes >= 1 &&
+      NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && NumElts >= 4 &&
+      NumDifferentLanes >= NumConstantLanes;
+
   // If there was only one constant value used and for more than one lane,
   // start by splatting that value, then replace the non-constant lanes. This
   // is better than the default, which will perform a separate initialization
   // for each lane.
-  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+  if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
     // Firstly, try to materialize the splat constant.
     SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
             Val = ConstantBuildVector(Vec, DAG);
@@ -9175,13 +9190,29 @@
                "expansion\n");
     return SDValue();
   }
-
+  //
   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   if (NumElts >= 4) {
     if (SDValue shuffle = ReconstructShuffle(Op, DAG))
       return shuffle;
   }

+  if (PreferDUPAndInsert) {
+    // First, build a constant vector with the common element.
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i)
+      Ops.push_back(Value);
+    SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
+    // Next, insert the elements that do not match the common value.
+    for (unsigned i = 0; i < NumElts; ++i)
+      if (Op.getOperand(i) != Value)
+        NewVector =
+            DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
+                        Op.getOperand(i), DAG.getConstant(i, dl, MVT::i64));
+
+    return NewVector;
+  }
+
   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
   // know the default expansion would otherwise fall back on something even
   // worse. For a vector with one or two non-undef values, that's
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -34,12 +34,10 @@
 define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) {
 ; CHECK-LABEL: test_insert_3_f32_undef_zero_vector:
 ; CHECK:       bb.0:
-; CHECK-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-NEXT:    // kill
-; CHECK-NEXT:    mov.s v1[0], v0[0]
-; CHECK-NEXT:    mov.s v1[1], v0[0]
-; CHECK-NEXT:    mov.s v1[2], v0[0]
-; CHECK-NEXT:    mov.16b v0, v1
+; CHECK-NEXT:    fmov s1, wzr
+; CHECK-NEXT:    dup.4s v0, v0[0]
+; CHECK-NEXT:    mov.s v0[3], v1[0]
 ; CHECK-NEXT:    ret
 ;
 entry:
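
Not part of the patch itself: the sketch below restates the PreferDUPAndInsert condition from the first hunk as a standalone C++ helper and plugs in the lane counts for the test_insert_3_f32_undef_zero_vector case above, which is why that test now checks for fmov + dup + a single lane insert instead of movi plus four mov instructions. The helper name preferDUPAndInsert and the printf driver are illustrative assumptions only; the parameter names mirror the locals in LowerBUILD_VECTOR.

// Standalone sketch (not from the patch): the DUP-vs-INSERT heuristic in
// isolation, so the lane arithmetic can be checked by hand.
#include <cstdio>

static bool preferDUPAndInsert(bool IsConstant, unsigned NumElts,
                               unsigned NumUndefLanes,
                               unsigned NumDifferentLanes,
                               unsigned NumConstantLanes) {
  return !IsConstant && NumDifferentLanes >= 1 &&
         NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && NumElts >= 4 &&
         NumDifferentLanes >= NumConstantLanes;
}

int main() {
  // test_insert_3_f32_undef_zero_vector builds <4 x float> {%a, %a, %a, 0.0}:
  // no undef lanes, one constant lane, and one lane differing from the common
  // value %a, so the new DUP + INSERT_VECTOR_ELT path is taken.
  std::printf("%d\n", preferDUPAndInsert(/*IsConstant=*/false, /*NumElts=*/4,
                                         /*NumUndefLanes=*/0,
                                         /*NumDifferentLanes=*/1,
                                         /*NumConstantLanes=*/1)); // prints 1

  // A 2-element vector fails both the NumElts >= 4 check and the
  // "fewer than half the defined lanes differ" check, so the existing
  // per-lane INSERT_VECTOR_ELT expansion is kept.
  std::printf("%d\n", preferDUPAndInsert(/*IsConstant=*/false, /*NumElts=*/2,
                                         /*NumUndefLanes=*/0,
                                         /*NumDifferentLanes=*/1,
                                         /*NumConstantLanes=*/0)); // prints 0
  return 0;
}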