Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12236,6 +12236,10 @@
   unsigned NumUndefLanes = 0;
   SDValue Value;
   SDValue ConstantValue;
+  SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
+  unsigned ConsecutiveValCount = 0;
+  SmallVector<int, 16> MaskVec;
+  SDValue PrevVal;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
@@ -12263,6 +12267,20 @@
       usesOnlyOneValue = false;
       ++NumDifferentLanes;
     }
+
+    // The shuffle mask is valid only if DifferentValueMap's size ends up as 2.
+    if (V == Value)
+      MaskVec.push_back(0);
+    else
+      MaskVec.push_back(NumElts);
+
+    if (PrevVal != V) {
+      ConsecutiveValCount = 0;
+      PrevVal = V;
+    }
+
+    // Keep the different values and their consecutive counts.
+    DifferentValueMap[V] = ++ConsecutiveValCount;
   }
 
   if (!Value.getNode()) {
@@ -12468,6 +12486,46 @@
     return NewVector;
   }
 
+  // If the vector consists of two different values, try to generate two DUPs
+  // and a CONCAT_VECTORS or VECTOR_SHUFFLE.
+  if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
+    SmallVector<SDValue, 2> Vals;
+    // Check whether the consecutive count of each value is half the number of
+    // vector elements. In that case, we can use CONCAT_VECTORS.
+    bool canUseVECTOR_CONCAT = true;
+    for (auto Pair : DifferentValueMap) {
+      if (Pair.second != NumElts / 2)
+        canUseVECTOR_CONCAT = false;
+      Vals.push_back(Pair.first);
+    }
+
+    if (canUseVECTOR_CONCAT) {
+      EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+      if (isTypeLegal(SubVT) && SubVT.isVector() &&
+          SubVT.getVectorNumElements() >= 2) {
+        SmallVector<SDValue> Ops1(NumElts / 2, Vals[0]);
+        SmallVector<SDValue> Ops2(NumElts / 2, Vals[1]);
+        SDValue DUP1 =
+            LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
+        SDValue DUP2 =
+            LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
+        SDValue CONCAT_VECTORS =
+            DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
+        return CONCAT_VECTORS;
+      }
+    }
+
+    if (NumElts >= 8) {
+      SmallVector<SDValue> Ops1(NumElts, Vals[0]);
+      SmallVector<SDValue> Ops2(NumElts, Vals[1]);
+      SDValue DUP1 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops1), DAG);
+      SDValue DUP2 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops2), DAG);
+      SDValue VECTOR_SHUFFLE =
+          DAG.getVectorShuffle(VT, dl, DUP1, DUP2, MaskVec);
+      return VECTOR_SHUFFLE;
+    }
+  }
+
   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
   // know the default expansion would otherwise fall back on something even
   // worse. For a vector with one or two non-undef values, that's
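For reference, below is a minimal standalone sketch (plain C++, not LLVM code; the integer lane values stand in for SDValues, and std::map stands in for SmallMapVector) of the bookkeeping this patch adds. It shows how DifferentValueMap and MaskVec come out for a <x,x,x,x,y,y,y,y>-style build vector: two distinct values, each with a consecutive count of NumElts/2, so the CONCAT_VECTORS path applies. An interleaved pattern such as <x,y,x,y,...> yields different counts and falls through to the VECTOR_SHUFFLE path instead.

#include <cstdio>
#include <map>
#include <vector>

int main() {
  // Stand-ins for the build vector's operands (SDValues in the real code).
  const std::vector<int> Lanes = {7, 7, 7, 7, 9, 9, 9, 9};
  const unsigned NumElts = Lanes.size();
  const int Value = Lanes[0]; // first value seen, as in the patch

  std::map<int, unsigned> DifferentValueMap; // value -> last consecutive count
  std::vector<int> MaskVec;                  // shuffle mask for the DUP pair
  unsigned ConsecutiveValCount = 0;
  int PrevVal = -1; // sentinel; the patch uses a default-constructed SDValue

  for (int V : Lanes) {
    // Lane mask: 0 selects from dup(Value), NumElts selects from the
    // second dup's first lane.
    MaskVec.push_back(V == Value ? 0 : (int)NumElts);

    // Restart the consecutive count whenever the value changes.
    if (PrevVal != V) {
      ConsecutiveValCount = 0;
      PrevVal = V;
    }
    DifferentValueMap[V] = ++ConsecutiveValCount;
  }

  // CONCAT_VECTORS applies when exactly two values each fill half the vector.
  bool CanUseConcat = DifferentValueMap.size() == 2;
  for (auto &P : DifferentValueMap)
    CanUseConcat &= (P.second == NumElts / 2);
  std::printf("distinct values: %zu, concat path: %s\n",
              DifferentValueMap.size(), CanUseConcat ? "yes" : "no");
}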
Index: llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
===================================================================
--- llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -4,24 +4,9 @@
 define <16 x i8> @test1(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    mov v0.b[11], w8
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    ld1r { v1.8b }, [x1]
+; CHECK-NEXT:    ld1r { v0.8b }, [x0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i8, ptr %a, align 1
@@ -75,24 +60,9 @@
 define <16 x i8> @test4(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    mov v0.b[11], w8
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    ld1r { v1.8b }, [x0]
+; CHECK-NEXT:    ld1r { v0.8b }, [x1]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i8, ptr %a, align 1
@@ -128,17 +98,12 @@
 define <8 x i8> @test6(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test6:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ld1r { v0.8b }, [x1]
+; CHECK-NEXT:    adrp x8, .LCPI5_0
+; CHECK-NEXT:    ld1r { v1.8b }, [x0]
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT:    tbl v0.8b, { v1.16b }, v0.8b
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i8, ptr %a, align 1
@@ -154,17 +119,12 @@
 define <8 x i8> @test7(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test7:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ld1r { v0.8b }, [x0]
+; CHECK-NEXT:    adrp x8, .LCPI6_0
+; CHECK-NEXT:    ld1r { v1.8b }, [x1]
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI6_0]
+; CHECK-NEXT:    tbl v0.8b, { v1.16b }, v0.8b
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i8, ptr %a, align 1
@@ -180,16 +140,9 @@
 define <8 x i16> @test8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.h[1], w8
-; CHECK-NEXT:    mov v0.h[2], w8
-; CHECK-NEXT:    mov v0.h[3], w8
-; CHECK-NEXT:    ldrh w8, [x1]
-; CHECK-NEXT:    mov v0.h[4], w8
-; CHECK-NEXT:    mov v0.h[5], w8
-; CHECK-NEXT:    mov v0.h[6], w8
-; CHECK-NEXT:    mov v0.h[7], w8
+; CHECK-NEXT:    ld1r { v1.4h }, [x1]
+; CHECK-NEXT:    ld1r { v0.4h }, [x0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i16, ptr %a, align 1
@@ -205,12 +158,9 @@
 define <4 x i32> @test9(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test9:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    ldr w8, [x1]
-; CHECK-NEXT:    mov v0.s[2], w8
-; CHECK-NEXT:    mov v0.s[3], w8
+; CHECK-NEXT:    ld1r { v1.2s }, [x1]
+; CHECK-NEXT:    ld1r { v0.2s }, [x0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i32, ptr %a, align 1