Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12236,6 +12236,9 @@ unsigned NumUndefLanes = 0; SDValue Value; SDValue ConstantValue; + SmallMapVector<SDNode *, unsigned, 16> DifferentValueMap; + unsigned ConsecutiveValCount = 0; + SDValue PrevVal; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) @@ -12263,6 +12266,13 @@ usesOnlyOneValue = false; ++NumDifferentLanes; } + + if (PrevVal != V) { + ConsecutiveValCount = 0; + PrevVal = V; + } + + DifferentValueMap[V.getNode()] = ++ConsecutiveValCount; } if (!Value.getNode()) { @@ -12454,6 +12464,48 @@ return Shuffle; } + // If vector has two different values and it can be split into two sub + // vectors with same length, generate two DUP and CONCAT_VECTORS with them. + // For example, + // + // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23, + // t24, t24, t24, t24, t24, t24, t24, t24 + // ==> + // t26: v8i8 = AArch64ISD::DUP t23 + // t28: v8i8 = AArch64ISD::DUP t24 + // t29: v16i8 = concat_vectors t26, t28 + bool canBeSplitWithTwoDUP = true; + + // Check there are two different values in the vector. + if (DifferentValueMap.size() != 2) + canBeSplitWithTwoDUP = false; + + // Check the consecutive count of the value is the half number of vector + // elements. 
+ for (auto Pair : DifferentValueMap) { + if (Pair.second != NumElts / 2) { + canBeSplitWithTwoDUP = false; + break; + } + } + + if (canBeSplitWithTwoDUP) { + EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + if (isTypeLegal(SubVT) && SubVT.isVector() && + SubVT.getVectorNumElements() > 1) { + unsigned SubNumElts = NumElts / 2; + SmallVector<SDValue, 8> Ops1(SubNumElts, Op.getOperand(0)); + SmallVector<SDValue, 8> Ops2(SubNumElts, Op.getOperand(SubNumElts)); + SDValue SubVector1 = + LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG); + SDValue SubVector2 = + LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG); + SDValue ConcatVector = + DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SubVector1, SubVector2); + return ConcatVector; + } + } + if (PreferDUPAndInsert) { // First, build a constant vector with the common element. SmallVector<SDValue, 8> Ops(NumElts, Value); Index: llvm/test/CodeGen/AArch64/arm64-tbl.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-tbl.ll +++ llvm/test/CodeGen/AArch64/arm64-tbl.ll @@ -160,7 +160,7 @@ ; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w8, #32 // =0x20 ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: mov.b v4[1], w0 @@ -173,19 +173,19 @@ ; CHECK-NEXT: mov.b v4[6], w0 ; CHECK-NEXT: mov.b v4[7], w0 ; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: mov w8, #36 +; CHECK-NEXT: mov w8, #36 // =0x24 ; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: mov w8, #40 +; CHECK-NEXT: mov w8, #40 // =0x28 ; CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov w8, #44 +; CHECK-NEXT: mov w8, #44 // =0x2c ; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov w8, #48 +; CHECK-NEXT: mov w8, #48 // =0x30 ; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov w8, #52 +; CHECK-NEXT: mov w8, #52 // =0x34 ; 
CHECK-NEXT: mov.b v4[13], w8 -; CHECK-NEXT: mov w8, #56 +; CHECK-NEXT: mov w8, #56 // =0x38 ; CHECK-NEXT: mov.b v4[14], w8 -; CHECK-NEXT: mov w8, #60 +; CHECK-NEXT: mov w8, #60 // =0x3c ; CHECK-NEXT: mov.b v4[15], w8 ; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 ; CHECK-NEXT: ret @@ -214,7 +214,7 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { ; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 @@ -226,22 +226,22 @@ ; CHECK-NEXT: mov.b v4[4], w8 ; CHECK-NEXT: mov.b v4[5], w8 ; CHECK-NEXT: mov.b v4[6], w8 -; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w8, #32 // =0x20 ; CHECK-NEXT: mov.b v4[7], w0 ; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: mov w8, #36 +; CHECK-NEXT: mov w8, #36 // =0x24 ; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: mov w8, #40 +; CHECK-NEXT: mov w8, #40 // =0x28 ; CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov w8, #44 +; CHECK-NEXT: mov w8, #44 // =0x2c ; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov w8, #48 +; CHECK-NEXT: mov w8, #48 // =0x30 ; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov w8, #52 +; CHECK-NEXT: mov w8, #52 // =0x34 ; CHECK-NEXT: mov.b v4[13], w8 -; CHECK-NEXT: mov w8, #56 +; CHECK-NEXT: mov w8, #56 // =0x38 ; CHECK-NEXT: mov.b v4[14], w8 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: mov.b v4[15], w8 ; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 ; CHECK-NEXT: ret @@ -315,23 +315,18 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { ; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2: ; CHECK: // %bb.0: -; 
CHECK-NEXT: mov w8, #255 -; CHECK-NEXT: dup.16b v4, w0 -; CHECK-NEXT: adrp x9, .LCPI13_0 +; CHECK-NEXT: movi.2d v4, #0xffffffffffffffff +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: dup.8b v5, w0 ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI13_0] -; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5 -; CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov.b v4[13], w8 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: adrp x8, .LCPI13_1 -; CHECK-NEXT: tbl.16b v3, { v0, v1 }, v4 +; CHECK-NEXT: mov.d v5[1], v4[0] +; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v6 +; CHECK-NEXT: tbl.16b v3, { v0, v1 }, v5 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI13_1] ; CHECK-NEXT: tbl.16b v0, { v2, v3 }, v0 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/build-vector-two-dup.ll =================================================================== --- llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -4,24 +4,9 @@ define <16 x i8> @test1(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: ldrb w8, [x1] -; CHECK-NEXT: mov v0.b[8], w8 -; CHECK-NEXT: mov v0.b[9], w8 -; CHECK-NEXT: mov v0.b[10], w8 -; CHECK-NEXT: mov v0.b[11], w8 -; CHECK-NEXT: mov v0.b[12], w8 -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: mov 
v0.b[14], w8 -; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ld1r { v1.8b }, [x1] +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: %0 = load i8, ptr %a, align 1 @@ -75,24 +60,9 @@ define <16 x i8> @test4(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: mov v0.b[8], w8 -; CHECK-NEXT: mov v0.b[9], w8 -; CHECK-NEXT: mov v0.b[10], w8 -; CHECK-NEXT: mov v0.b[11], w8 -; CHECK-NEXT: mov v0.b[12], w8 -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: mov v0.b[14], w8 -; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ld1r { v1.8b }, [x0] +; CHECK-NEXT: ld1r { v0.8b }, [x1] +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: %0 = load i8, ptr %a, align 1