Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -219,6 +219,9 @@
                 // lanes
     VCVTL,      // MVE vcvt f16 -> f32, extending from either the bottom or top lanes
 
+    // MVE VIDUP instruction, taking a start value and increment.
+    VIDUP,
+
     // Vector multiply long:
     VMULLs,     // ...signed
     VMULLu,     // ...unsigned
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1730,6 +1730,7 @@
     MAKE_CASE(ARMISD::VQMOVNu)
     MAKE_CASE(ARMISD::VCVTN)
     MAKE_CASE(ARMISD::VCVTL)
+    MAKE_CASE(ARMISD::VIDUP)
     MAKE_CASE(ARMISD::VMULLs)
     MAKE_CASE(ARMISD::VMULLu)
     MAKE_CASE(ARMISD::VQDMULH)
@@ -7466,6 +7467,45 @@
   return Base;
 }
 
+// Try to turn a BUILD_VECTOR of the form (X, X+N, X+2*N, ...) - i.e. each
+// element is the first operand plus a constant multiple of its lane index -
+// into an MVE VIDUP node, which increments from a scalar start value.
+static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
+                                        const ARMSubtarget *ST) {
+  if (!ST->hasMVEIntegerOps())
+    return SDValue();
+
+  // We are looking for a buildvector where each element is Op[0] + i*N.
+  EVT VT = Op.getValueType();
+  SDValue Op0 = Op.getOperand(0);
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Get the increment value from operand 1.
+  SDValue Op1 = Op.getOperand(1);
+  if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
+      !isa<ConstantSDNode>(Op1.getOperand(1)))
+    return SDValue();
+  unsigned N = Op1.getConstantOperandVal(1);
+  // VIDUP only supports increments of 1, 2, 4 or 8.
+  if (N != 1 && N != 2 && N != 4 && N != 8)
+    return SDValue();
+
+  // Check that each other operand matches the expected Op0 + I*N form.
+  for (unsigned I = 2; I < NumElts; I++) {
+    SDValue OpI = Op.getOperand(I);
+    if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
+        !isa<ConstantSDNode>(OpI.getOperand(1)) ||
+        OpI.getConstantOperandVal(1) != I * N)
+      return SDValue();
+  }
+
+  SDLoc DL(Op);
+  // VIDUP also produces the incremented-past-the-end scalar as a second
+  // result, hence the (VT, i32) value list.
+  return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
+                     DAG.getConstant(N, DL, MVT::i32));
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.
 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
@@ -7477,6 +7517,9 @@
   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
     return LowerBUILD_VECTOR_i1(Op, DAG, ST);
 
+  if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
+    return R;
+
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
Index: llvm/lib/Target/ARM/ARMInstrInfo.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrInfo.td
+++ llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -256,6 +256,10 @@
                            SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
                                                 SDTCisVT<2, i32>]>>;
 
+def SDTARMVIDUP : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisVT<1, i32>,
+                                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def ARMvidup : SDNode<"ARMISD::VIDUP", SDTARMVIDUP>;
+
 def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
 def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
 def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5715,11 +5715,12 @@
 defm MVE_VQRDMLASH_qr : MVE_VQDMLAH_qr_types<"vqrdmlash", 0b0, 0b1>;
 
 class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
-                list<dag> pattern=[]>
+                ValueType VT, SDPatternOperator vxdup>
   : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn),
           (ins tGPREven:$Rn_src, MVE_VIDUP_imm:$imm), NoItinerary,
           iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src",
-          pattern> {
+          [(set (VT MQPR:$Qd), (i32 tGPREven:$Rn),
+              (vxdup (i32 tGPREven:$Rn_src), (i32 imm:$imm)))]> {
   bits<4> Qd;
   bits<4> Rn;
   bits<2> imm;
@@ -5740,13 +5741,13 @@
   let hasSideEffects = 0;
 }
 
-def MVE_VIDUPu8  : MVE_VxDUP<"vidup", "u8",  0b00, 0b0>;
-def MVE_VIDUPu16 : MVE_VxDUP<"vidup", "u16", 0b01, 0b0>;
-def MVE_VIDUPu32 : MVE_VxDUP<"vidup", "u32", 0b10, 0b0>;
+def MVE_VIDUPu8  : MVE_VxDUP<"vidup", "u8",  0b00, 0b0, v16i8, ARMvidup>;
+def MVE_VIDUPu16 : MVE_VxDUP<"vidup", "u16", 0b01, 0b0, v8i16, ARMvidup>;
+def MVE_VIDUPu32 : MVE_VxDUP<"vidup", "u32", 0b10, 0b0, v4i32, ARMvidup>;
 
-def MVE_VDDUPu8  : MVE_VxDUP<"vddup", "u8",  0b00, 0b1>;
-def MVE_VDDUPu16 : MVE_VxDUP<"vddup", "u16", 0b01, 0b1>;
-def MVE_VDDUPu32 : MVE_VxDUP<"vddup", "u32", 0b10, 0b1>;
+def MVE_VDDUPu8  : MVE_VxDUP<"vddup", "u8",  0b00, 0b1, v16i8, null_frag>;
+def MVE_VDDUPu16 : MVE_VxDUP<"vddup", "u16", 0b01, 0b1, v8i16, null_frag>;
+def MVE_VDDUPu32 : MVE_VxDUP<"vddup", "u32", 0b10, 0b1, v4i32, null_frag>;
 
 class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
                  list<dag> pattern=[]>
Index: llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
+++ llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
@@ -14,55 +14,17 @@
 ; CHECK-NEXT:    it ls
 ; CHECK-NEXT:    popls {r4, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %while.body.preheader
-; CHECK-NEXT:    subs r0, r0, r1
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w r2, r0, #15
-; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    bic r2, r2, #15
-; CHECK-NEXT:    subs r2, #16
-; CHECK-NEXT:    add.w r3, r3, r2, lsr #4
+; CHECK-NEXT:    subs r4, r0, r1
 ; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    mov r3, r1
+; CHECK-NEXT:    dlstp.8 lr, r4
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, r1, r2
-; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vmov.8 q0[0], r3
-; CHECK-NEXT:    adds r4, r3, #1
-; CHECK-NEXT:    vmov.8 q0[1], r4
-; CHECK-NEXT:    adds r4, r3, #2
-; CHECK-NEXT:    vmov.8 q0[2], r4
-; CHECK-NEXT:    adds r4, r3, #3
-; CHECK-NEXT:    vmov.8 q0[3], r4
-; CHECK-NEXT:    adds r4, r3, #4
-; CHECK-NEXT:    vmov.8 q0[4], r4
-; CHECK-NEXT:    adds r4, r3, #5
-; CHECK-NEXT:    vmov.8 q0[5], r4
-; CHECK-NEXT:    adds r4, r3, #6
-; CHECK-NEXT:    vmov.8 q0[6], r4
-; CHECK-NEXT:    adds r4, r3, #7
-; CHECK-NEXT:    vmov.8 q0[7], r4
-; CHECK-NEXT:    add.w r4, r3, #8
-; CHECK-NEXT:    vmov.8 q0[8],
r4 -; CHECK-NEXT: add.w r4, r3, #9 -; CHECK-NEXT: vmov.8 q0[9], r4 -; CHECK-NEXT: add.w r4, r3, #10 -; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: add.w r4, r3, #11 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: add.w r4, r3, #12 -; CHECK-NEXT: vmov.8 q0[12], r4 -; CHECK-NEXT: add.w r4, r3, #13 -; CHECK-NEXT: vmov.8 q0[13], r4 -; CHECK-NEXT: add.w r4, r3, #14 +; CHECK-NEXT: adds r0, r1, r2 ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs r0, #16 -; CHECK-NEXT: vmov.8 q0[14], r4 -; CHECK-NEXT: adds r3, #15 -; CHECK-NEXT: vmov.8 q0[15], r3 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.8 q0, [r12], #16 -; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: vidup.u8 q0, r0, #1 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: letp lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r4, pc} entry: Index: llvm/test/CodeGen/Thumb2/mve-vidup.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vidup.ll +++ llvm/test/CodeGen/Thumb2/mve-vidup.ll @@ -4,11 +4,7 @@ define arm_aapcs_vfpcc <4 x i32> @vidup_v4i32_1(i32 %index) { ; CHECK-LABEL: vidup_v4i32_1: ; CHECK: @ %bb.0: -; CHECK-NEXT: adds r1, r0, #2 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-NEXT: vidup.u32 q0, r0, #1 ; CHECK-NEXT: bx lr %a1 = add i32 %index, 1 %a2 = add i32 %index, 2 @@ -24,11 +20,7 @@ define arm_aapcs_vfpcc <4 x i32> @vidup_v4i32_2(i32 %index) { ; CHECK-LABEL: vidup_v4i32_2: ; CHECK: @ %bb.0: -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: adds r0, #2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-NEXT: vidup.u32 q0, r0, #2 ; CHECK-NEXT: bx lr %a1 = add i32 %index, 2 %a2 = add i32 %index, 4 @@ -64,11 +56,7 @@ define arm_aapcs_vfpcc <4 x i32> @vidup_v4i32_4(i32 %index) { ; CHECK-LABEL: vidup_v4i32_4: ; CHECK: @ %bb.0: -; CHECK-NEXT: add.w r1, r0, #8 -; CHECK-NEXT: vmov q0[2], 
q0[0], r0, r1 -; CHECK-NEXT: add.w r1, r0, #12 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-NEXT: vidup.u32 q0, r0, #4 ; CHECK-NEXT: bx lr %a1 = add i32 %index, 4 %a2 = add i32 %index, 8 @@ -84,11 +72,7 @@ define arm_aapcs_vfpcc <4 x i32> @vidup_v4i32_8(i32 %index) { ; CHECK-LABEL: vidup_v4i32_8: ; CHECK: @ %bb.0: -; CHECK-NEXT: add.w r1, r0, #16 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: add.w r1, r0, #24 -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-NEXT: vidup.u32 q0, r0, #8 ; CHECK-NEXT: bx lr %a1 = add i32 %index, 8 %a2 = add i32 %index, 16 @@ -125,21 +109,7 @@ define arm_aapcs_vfpcc <8 x i16> @vidup_v8i16_1(i16 %index) { ; CHECK-LABEL: vidup_v8i16_1: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: adds r1, r0, #1 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: adds r1, r0, #2 -; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: adds r0, #7 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vidup.u16 q0, r0, #1 ; CHECK-NEXT: bx lr %a1 = add i16 %index, 1 %a2 = add i16 %index, 2 @@ -163,21 +133,7 @@ define arm_aapcs_vfpcc <8 x i16> @vidup_v8i16_2(i16 %index) { ; CHECK-LABEL: vidup_v8i16_2: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: adds r1, r0, #2 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: add.w r1, r0, #8 -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: add.w r1, r0, #10 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: add.w r1, r0, #12 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: adds r0, #14 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vidup.u16 q0, r0, #2 ; 
CHECK-NEXT: bx lr %a1 = add i16 %index, 2 %a2 = add i16 %index, 4 @@ -201,21 +157,7 @@ define arm_aapcs_vfpcc <8 x i16> @vidup_v8i16_4(i16 %index) { ; CHECK-LABEL: vidup_v8i16_4: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: add.w r1, r0, #8 -; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: add.w r1, r0, #12 -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: add.w r1, r0, #16 -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: add.w r1, r0, #20 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: add.w r1, r0, #24 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: adds r0, #28 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vidup.u16 q0, r0, #4 ; CHECK-NEXT: bx lr %a1 = add i16 %index, 4 %a2 = add i16 %index, 8 @@ -239,21 +181,7 @@ define arm_aapcs_vfpcc <8 x i16> @vidup_v8i16_8(i16 %index) { ; CHECK-LABEL: vidup_v8i16_8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: add.w r1, r0, #8 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: add.w r1, r0, #16 -; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: add.w r1, r0, #24 -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: add.w r1, r0, #32 -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: add.w r1, r0, #40 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: add.w r1, r0, #48 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: adds r0, #56 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vidup.u16 q0, r0, #8 ; CHECK-NEXT: bx lr %a1 = add i16 %index, 8 %a2 = add i16 %index, 16 @@ -277,37 +205,7 @@ define arm_aapcs_vfpcc <16 x i8> @vidup_v16i8_1(i8 %index) { ; CHECK-LABEL: vidup_v16i8_1: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: adds r1, r0, #1 -; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: adds r1, r0, #2 -; CHECK-NEXT: vmov.8 q0[2], r1 -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: vmov.8 q0[3], r1 -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: vmov.8 q0[4], r1 -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: vmov.8 q0[5], r1 -; 
CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: vmov.8 q0[6], r1 -; CHECK-NEXT: adds r1, r0, #7 -; CHECK-NEXT: vmov.8 q0[7], r1 -; CHECK-NEXT: add.w r1, r0, #8 -; CHECK-NEXT: vmov.8 q0[8], r1 -; CHECK-NEXT: add.w r1, r0, #9 -; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: add.w r1, r0, #10 -; CHECK-NEXT: vmov.8 q0[10], r1 -; CHECK-NEXT: add.w r1, r0, #11 -; CHECK-NEXT: vmov.8 q0[11], r1 -; CHECK-NEXT: add.w r1, r0, #12 -; CHECK-NEXT: vmov.8 q0[12], r1 -; CHECK-NEXT: add.w r1, r0, #13 -; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: add.w r1, r0, #14 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: adds r0, #15 -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vidup.u8 q0, r0, #1 ; CHECK-NEXT: bx lr %a1 = add i8 %index, 1 %a2 = add i8 %index, 2 @@ -347,37 +245,7 @@ define arm_aapcs_vfpcc <16 x i8> @vidup_v16i8_4(i8 %index) { ; CHECK-LABEL: vidup_v16i8_4: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: add.w r1, r0, #8 -; CHECK-NEXT: vmov.8 q0[2], r1 -; CHECK-NEXT: add.w r1, r0, #12 -; CHECK-NEXT: vmov.8 q0[3], r1 -; CHECK-NEXT: add.w r1, r0, #16 -; CHECK-NEXT: vmov.8 q0[4], r1 -; CHECK-NEXT: add.w r1, r0, #20 -; CHECK-NEXT: vmov.8 q0[5], r1 -; CHECK-NEXT: add.w r1, r0, #24 -; CHECK-NEXT: vmov.8 q0[6], r1 -; CHECK-NEXT: add.w r1, r0, #28 -; CHECK-NEXT: vmov.8 q0[7], r1 -; CHECK-NEXT: add.w r1, r0, #32 -; CHECK-NEXT: vmov.8 q0[8], r1 -; CHECK-NEXT: add.w r1, r0, #36 -; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: add.w r1, r0, #40 -; CHECK-NEXT: vmov.8 q0[10], r1 -; CHECK-NEXT: add.w r1, r0, #44 -; CHECK-NEXT: vmov.8 q0[11], r1 -; CHECK-NEXT: add.w r1, r0, #48 -; CHECK-NEXT: vmov.8 q0[12], r1 -; CHECK-NEXT: add.w r1, r0, #52 -; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: add.w r1, r0, #56 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: adds r0, #60 -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vidup.u8 q0, r0, #4 ; CHECK-NEXT: bx lr %a1 = add i8 %index, 4 %a2 = add i8 %index, 8