diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12353,6 +12353,9 @@ unsigned NumUndefLanes = 0; SDValue Value; SDValue ConstantValue; + SmallMapVector<SDValue, unsigned, 16> DifferentValueMap; + unsigned ConsecutiveValCount = 0; + SDValue PrevVal; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) @@ -12380,6 +12383,24 @@ usesOnlyOneValue = false; ++NumDifferentLanes; } + + if (PrevVal != V) { + ConsecutiveValCount = 0; + PrevVal = V; + } + + // Keep the different values and their last consecutive counts. For example, + // + // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23, + // t24, t24, t24, t24, t24, t24, t24, t24 + // t23 = consecutive count 8 + // t24 = consecutive count 8 + // ------------------------------------------------------------------ + // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24, + // t24, t24, t24, t24, t24, t24, t24, t24 + // t23 = consecutive count 5 + // t24 = consecutive count 9 + DifferentValueMap[V] = ++ConsecutiveValCount; } if (!Value.getNode()) { @@ -12585,6 +12606,82 @@ return NewVector; } + // If the vector consists of two different values, try to generate two DUPs and + // (CONCAT_VECTORS or VECTOR_SHUFFLE). + if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) { + SmallVector<SDValue> Vals; + // Check that the consecutive count of each value is half the number of + // vector elements. In this case, we can use CONCAT_VECTORS. 
For example, + // + // canUseVECTOR_CONCAT = true; + // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23, + // t24, t24, t24, t24, t24, t24, t24, t24 + // + // canUseVECTOR_CONCAT = false; + // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24, + // t24, t24, t24, t24, t24, t24, t24, t24 + bool canUseVECTOR_CONCAT = true; + for (auto Pair : DifferentValueMap) { + // Check that the different values have the same length, which is NumElts / 2. + if (Pair.second != NumElts / 2) + canUseVECTOR_CONCAT = false; + Vals.push_back(Pair.first); + } + + // If canUseVECTOR_CONCAT is true, we can generate two DUPs and + // CONCAT_VECTORS. For example, + // + // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23, + // t24, t24, t24, t24, t24, t24, t24, t24 + // ==> + // t26: v8i8 = AArch64ISD::DUP t23 + // t28: v8i8 = AArch64ISD::DUP t24 + // t29: v16i8 = concat_vectors t26, t28 + if (canUseVECTOR_CONCAT) { + EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + if (isTypeLegal(SubVT) && SubVT.isVector() && + SubVT.getVectorNumElements() >= 2) { + SmallVector<SDValue> Ops1(NumElts / 2, Vals[0]); + SmallVector<SDValue> Ops2(NumElts / 2, Vals[1]); + SDValue DUP1 = + LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG); + SDValue DUP2 = + LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG); + SDValue CONCAT_VECTORS = + DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2); + return CONCAT_VECTORS; + } + } + + // Let's try to generate two DUPs and VECTOR_SHUFFLE. For example, + // + // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26 + // ==> + // t28: v8i8 = AArch64ISD::DUP t25 + // t30: v8i8 = AArch64ISD::DUP t26 + // t31: v8i8 = vector_shuffle<0,0,0,0,8,8,8,8> t28, t30 + if (NumElts >= 8) { + SmallVector<int, 16> MaskVec; + // Build mask for VECTOR_SHUFFLE. 
+ SDValue FirstLaneVal = Op.getOperand(0); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Val = Op.getOperand(i); + if (FirstLaneVal == Val) + MaskVec.push_back(0); + else + MaskVec.push_back(NumElts); + } + + SmallVector<SDValue> Ops1(NumElts, Vals[0]); + SmallVector<SDValue> Ops2(NumElts, Vals[1]); + SDValue DUP1 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops1), DAG); + SDValue DUP2 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops2), DAG); + SDValue VECTOR_SHUFFLE = + DAG.getVectorShuffle(VT, dl, DUP1, DUP2, MaskVec); + return VECTOR_SHUFFLE; + } + } + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll --- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -4,24 +4,9 @@ define <16 x i8> @test1(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: ldrb w8, [x1] -; CHECK-NEXT: mov v0.b[8], w8 -; CHECK-NEXT: mov v0.b[9], w8 -; CHECK-NEXT: mov v0.b[10], w8 -; CHECK-NEXT: mov v0.b[11], w8 -; CHECK-NEXT: mov v0.b[12], w8 -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: mov v0.b[14], w8 -; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ld1r { v1.8b }, [x1] +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: %0 = load i8, ptr %a, align 1 @@ -75,24 +60,9 @@ define <16 x i8> @test4(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; 
CHECK-LABEL: test4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: mov v0.b[8], w8 -; CHECK-NEXT: mov v0.b[9], w8 -; CHECK-NEXT: mov v0.b[10], w8 -; CHECK-NEXT: mov v0.b[11], w8 -; CHECK-NEXT: mov v0.b[12], w8 -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: mov v0.b[14], w8 -; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ld1r { v1.8b }, [x0] +; CHECK-NEXT: ld1r { v0.8b }, [x1] +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: %0 = load i8, ptr %a, align 1 @@ -128,17 +98,12 @@ define <8 x i8> @test6(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test6: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: ldrb w8, [x1] -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ld1r { v0.8b }, [x1] +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ld1r { v1.8b }, [x0] +; CHECK-NEXT: mov v1.d[1], v0.d[0] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b ; CHECK-NEXT: ret entry: %0 = load i8, ptr %a, align 1 @@ -154,17 +119,12 @@ define <8 x i8> @test7(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test7: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: 
mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: ld1r { v1.8b }, [x1] +; CHECK-NEXT: mov v1.d[1], v0.d[0] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b ; CHECK-NEXT: ret entry: %0 = load i8, ptr %a, align 1 @@ -180,16 +140,9 @@ define <8 x i16> @test8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: ldrh w8, [x1] -; CHECK-NEXT: mov v0.h[4], w8 -; CHECK-NEXT: mov v0.h[5], w8 -; CHECK-NEXT: mov v0.h[6], w8 -; CHECK-NEXT: mov v0.h[7], w8 +; CHECK-NEXT: ld1r { v1.4h }, [x1] +; CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: %0 = load i16, ptr %a, align 1 @@ -205,12 +158,9 @@ define <4 x i32> @test9(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test9: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: ldr w8, [x1] -; CHECK-NEXT: mov v0.s[2], w8 -; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: ld1r { v1.2s }, [x1] +; CHECK-NEXT: ld1r { v0.2s }, [x0] +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: %0 = load i32, ptr %a, align 1