Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12236,6 +12236,10 @@
   unsigned NumUndefLanes = 0;
   SDValue Value;
   SDValue ConstantValue;
+  SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
+  unsigned ConsecutiveValCount = 0;
+  SmallVector<int, 16> MaskVec;
+  SDValue PrevVal;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
@@ -12263,6 +12267,20 @@
       usesOnlyOneValue = false;
       ++NumDifferentLanes;
     }
+
+    // The vector mask is valid only if DifferentValueMap's size is 2.
+    if (V == Value)
+      MaskVec.push_back(0);
+    else
+      MaskVec.push_back(NumElts);
+
+    if (PrevVal != V) {
+      ConsecutiveValCount = 0;
+      PrevVal = V;
+    }
+
+    // Keep the different values and their consecutive counts.
+    DifferentValueMap[V] = ++ConsecutiveValCount;
   }
 
   if (!Value.getNode()) {
@@ -12468,6 +12486,43 @@
     return NewVector;
   }
 
+  // If the vector consists of two different values, try to generate two DUPs
+  // and (CONCAT_VECTORS or VECTOR_SHUFFLE).
+  if (DifferentValueMap.size() == 2 && NumUndefLanes == 0 && NumElts >= 8) {
+    SmallVector<SDValue, 2> Vals;
+    // Check whether the consecutive count of each value is half the number of
+    // vector elements. In that case, we can use CONCAT_VECTORS.
+    bool canUseVECTOR_CONCAT = true;
+    for (auto Pair : DifferentValueMap) {
+      if (Pair.second != NumElts / 2)
+        canUseVECTOR_CONCAT = false;
+      Vals.push_back(Pair.first);
+    }
+
+    if (canUseVECTOR_CONCAT) {
+      EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+      if (isTypeLegal(SubVT) && SubVT.isVector() &&
+          SubVT.getVectorNumElements() >= 4) {
+        SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
+        SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
+        SDValue DUP1 =
+            LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
+        SDValue DUP2 =
+            LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
+        SDValue CONCAT_VECTORS =
+            DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
+        return CONCAT_VECTORS;
+      }
+    }
+
+    SmallVector<SDValue, 16> Ops1(NumElts, Vals[0]);
+    SmallVector<SDValue, 16> Ops2(NumElts, Vals[1]);
+    SDValue DUP1 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops1), DAG);
+    SDValue DUP2 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops2), DAG);
+    SDValue VECTOR_SHUFFLE = DAG.getVectorShuffle(VT, dl, DUP1, DUP2, MaskVec);
+    return VECTOR_SHUFFLE;
+  }
+
   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
   // know the default expansion would otherwise fall back on something even
   // worse. For a vector with one or two non-undef values, that's
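The updated tests below exercise the CONCAT_VECTORS path: each value fills one consecutive half of the vector, so the result is two ld1r loads combined with a mov into the high half. When the two values are not split into consecutive halves, neither value reaches a consecutive count of NumElts / 2, so the lowering should fall through to the two-DUP VECTOR_SHUFFLE form instead. A minimal IR sketch of such an input (hypothetical: the function name is invented and this case is not part of the patch's test file):

  define <16 x i8> @interleaved(ptr %a, ptr %b) {
  entry:
    %0 = load i8, ptr %a, align 1
    %1 = load i8, ptr %b, align 1
    %2 = insertelement <16 x i8> undef, i8 %0, i64 0
    %3 = insertelement <16 x i8> %2, i8 %1, i64 1
    ; Alternate the two loaded values across all 16 lanes, so each value's
    ; consecutive run has length 1 and CONCAT_VECTORS does not apply.
    %4 = shufflevector <16 x i8> %3, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    ret <16 x i8> %4
  }

Note that because both shuffle operands are splats, any lane of either one will do: MaskVec only ever holds 0 (lane 0 of the first DUP) or NumElts (lane 0 of the second DUP).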
Index: llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
===================================================================
--- llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -4,24 +4,9 @@
 define <16 x i8> @test1(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    mov v0.b[11], w8
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    ld1r { v1.8b }, [x1]
+; CHECK-NEXT:    ld1r { v0.8b }, [x0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i8, ptr %a, align 1
@@ -75,24 +60,9 @@
 define <16 x i8> @test4(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    mov v0.b[11], w8
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    ld1r { v1.8b }, [x0]
+; CHECK-NEXT:    ld1r { v0.8b }, [x1]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i8, ptr %a, align 1
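The transform is deliberately narrow: it requires exactly two distinct values, no undef lanes, and at least eight elements. For contrast, a hypothetical input that the NumUndefLanes == 0 guard rejects, leaving it to the existing lowering paths (again a sketch, not part of this patch's tests):

  define <16 x i8> @one_undef_lane(ptr %a, ptr %b) {
  entry:
    %0 = load i8, ptr %a, align 1
    %1 = load i8, ptr %b, align 1
    %2 = insertelement <16 x i8> undef, i8 %0, i64 0
    %3 = insertelement <16 x i8> %2, i8 %1, i64 8
    ; Lane 7 is undef, so NumUndefLanes != 0 and the new path is skipped.
    %4 = shufflevector <16 x i8> %3, <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
    ret <16 x i8> %4
  }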