diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -357,6 +357,8 @@ LDNF1S_MERGE_ZERO, LDFF1_MERGE_ZERO, LDFF1S_MERGE_ZERO, + LD1R_MERGE_ZERO, + LD1RS_MERGE_ZERO, LD1RQ_MERGE_ZERO, LD1RO_MERGE_ZERO, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -922,6 +922,8 @@ setTargetDAGCombine(ISD::FP_EXTEND); + setTargetDAGCombine(ISD::SPLAT_VECTOR); + setTargetDAGCombine(ISD::GlobalAddress); // In case of strict alignment, avoid an excessive number of byte wide stores. @@ -2360,6 +2362,8 @@ MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1R_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1RS_MERGE_ZERO) MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) @@ -20161,6 +20165,66 @@ return performPostLD1Combine(N, DCI, false); } +static SDValue performDUPMergePassthruOrSplatCombine(SDNode *N, + SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || !VT.isScalableVector() || + VT.getVectorElementType() == MVT::i1) + return SDValue(); + + bool IsSplat = N->getOpcode() == ISD::SPLAT_VECTOR; + SDValue Pred = + IsSplat ? getPredicateForScalableVector(DAG, DL, VT) : N->getOperand(0); + SDValue Load = N->getOperand(IsSplat ? 
0 : 1); + + if (!IsSplat) { + SDValue Passthru = N->getOperand(2); + + if (Passthru.getOpcode() != ISD::UNDEF && + !isZerosVector(Passthru.getNode()) && !isAllActivePredicate(DAG, Pred)) + return SDValue(); + } + + if (Load->getOpcode() != ISD::LOAD) + return SDValue(); + + LoadSDNode *LoadSDN = cast<LoadSDNode>(Load); + EVT MemVT = LoadSDN->getMemoryVT(); + SDValue Offset = LoadSDN->getOffset(); + + unsigned Opcode = LoadSDN->getExtensionType() == ISD::SEXTLOAD + ? AArch64ISD::LD1RS_MERGE_ZERO + : AArch64ISD::LD1R_MERGE_ZERO; + + if (Offset.isUndef()) + Offset = DAG.getTargetConstant(0, DL, MVT::i64); + + uint64_t Bytes = MemVT.getFixedSizeInBits() / 8; + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset); + if (!C || C->getZExtValue() >= (1 << 7) * Bytes || + C->getZExtValue() % Bytes != 0) + return SDValue(); + + // Check if there are other uses. If so, do not combine as it will introduce + // an extra load. + for (SDNode::use_iterator UI = Load->use_begin(), UE = Load->use_end(); + UI != UE; ++UI) { + if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. + continue; + if (*UI != N) + return SDValue(); + } + + SDValue Ops[] = {LoadSDN->getChain(), Pred, LoadSDN->getBasePtr(), Offset, + DAG.getValueType(getPackedSVEVectorVT(MemVT))}; + SDValue NewLoad = DAG.getNode(Opcode, DL, {VT, MVT::Other}, Ops); + SDValue LoadChain = SDValue(NewLoad.getNode(), 1); + + return DAG.getMergeValues({NewLoad, LoadChain}, DL); +} + /// Get rid of unnecessary NVCASTs (that don't change the type). 
static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) @@ -20908,6 +20972,8 @@ return performSVESpliceCombine(N, DAG); case ISD::FP_EXTEND: return performFPExtendCombine(N, DAG, DCI, Subtarget); + case ISD::SPLAT_VECTOR: + return performDUPMergePassthruOrSplatCombine(N, DAG); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::TBNZ: @@ -20964,6 +21030,8 @@ case AArch64ISD::UMULL: case AArch64ISD::PMULL: return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG); + case AArch64ISD::DUP_MERGE_PASSTHRU: + return performDUPMergePassthruOrSplatCombine(N, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -508,6 +508,27 @@ let ParserMatchClass = UImm6s16Operand; } +// tuimm6sN predicate - similar to uimm6sN, but use TImmLeaf (TargetConstant) +// instead of ImmLeaf (Constant) +def tuimm6s1 : Operand<i64>, TImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> { + let ParserMatchClass = UImm6s1Operand; +} +def tuimm6s2 : Operand<i64>, TImmLeaf<i64, [{ return Imm >= 0 && Imm < (64*2) && ((Imm % 2) == 0); }]> { + let PrintMethod = "printImmScale<2>"; + let ParserMatchClass = UImm6s2Operand; +} +def tuimm6s4 : Operand<i64>, TImmLeaf<i64, [{ return Imm >= 0 && Imm < (64*4) && ((Imm % 4) == 0); }]> { + let PrintMethod = "printImmScale<4>"; + let ParserMatchClass = UImm6s4Operand; +} +def tuimm6s8 : Operand<i64>, TImmLeaf<i64, [{ return Imm >= 0 && Imm < (64*8) && ((Imm % 8) == 0); }]> { + let PrintMethod = "printImmScale<8>"; + let ParserMatchClass = UImm6s8Operand; +} + def SImmS2XForm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64); }]>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- 
a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -50,9 +50,15 @@ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +def SDT_AArch64_LD1Replicate_Imm : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisInt<3>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>; def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1r_z : SDNode<"AArch64ISD::LD1R_MERGE_ZERO", SDT_AArch64_LD1Replicate_Imm, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1rs_z : SDNode<"AArch64ISD::LD1RS_MERGE_ZERO", SDT_AArch64_LD1Replicate_Imm, [SDNPHasChain, SDNPMayLoad]>; // Gather loads - node definitions // @@ -2276,46 +2282,52 @@ def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } - let AddedComplexity = 1 in { - class LD1RPat : - Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))), - (load (ptrue 31), GPR64:$base, $offset)>; + multiclass LD1RPat { + let AddedComplexity = 1 in { + def : Pat<(vt (operator PPR:$pg, GPR64:$base, timmtype:$offset, memvt)), + (load_instr $pg, $base, $offset)>; + def : Pat<(vt (operator PPR:$pg, (CP GPR64:$base, immtype:$offset), (i64 0), memvt)), + (load_instr $pg, $base, $offset)>; + } } - // LDR1 of 8-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - - // LDR1 of 16-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - - // LDR1 of 32-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - - // LDR1 of 64-bit data - def : LD1RPat; + // LD1R of 8-bit data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : 
LD1RPat; + defm : LD1RPat; + + // LD1R of 16-bit data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + + // LD1R of 32-bit data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + + // LD1R of 64-bit data + defm : LD1RPat; // LD1R of FP data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - -// LD1R of 128-bit masked data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + + // LD1R of 128-bit masked data multiclass ld1rq_pat{ def : Pat<(vt1 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), (!cast(load_instr # _IMM) $gp, $base, (i64 0))>; diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -723,6 +723,155 @@ ret %shf } +define @ld1rb_dup(i8* %valp, %pg) #0 { +; CHECK-LABEL: ld1rb_dup: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv16i8( undef, %pg, i8 %val) + ret %ret +} + +define @ld1rh_dup(i16* %valp, %pg) #0 { +; CHECK-LABEL: ld1rh_dup: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i16, i16* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv8i16( undef, %pg, i16 %val) + ret %ret +} + +define @ld1rw_dup(i32* %valp, %pg) #0 { +; CHECK-LABEL: ld1rw_dup: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i32, i32* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %val) + ret %ret +} + +define @ld1rd_dup(i64* %valp, %pg) #0 { +; CHECK-LABEL: ld1rd_dup: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i64, i64* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, 
i64 %val) + ret %ret +} + +define @ld1rh_bfloat_dup(bfloat* %valp, %pg) #0 { +; CHECK-LABEL: ld1rh_bfloat_dup: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load bfloat, bfloat* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv8bf16( undef, %pg, bfloat %val) + ret %ret +} + +define @ld1rh_half_dup(half* %valp, %pg) #0 { +; CHECK-LABEL: ld1rh_half_dup: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load half, half* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv8f16( undef, %pg, half %val) + ret %ret +} + +define @ld1rw_float_dup(float* %valp, %pg) #0 { +; CHECK-LABEL: ld1rw_float_dup: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load float, float* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv4f32( undef, %pg, float %val) + ret %ret +} + +define @ld1rd_double_dup(double* %valp, %pg) #0 { +; CHECK-LABEL: ld1rd_double_dup: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load double, double* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv2f64( undef, %pg, double %val) + ret %ret +} + +; Combine when passthru is zero +define @ld1rb_dup_zero_pt(i8* %valp, %pg) #0 { +; CHECK-LABEL: ld1rb_dup_zero_pt: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv16i8( zeroinitializer, %pg, i8 %val) + ret %ret +} + +; Combine when passthru is non-zero/undef but pred is all active +define @ld1rb_dup_pred_true(i8* %valp, %pt) #0 { +; CHECK-LABEL: ld1rb_dup_pred_true: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %pg = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %ret = tail call @llvm.aarch64.sve.dup.nxv16i8( %pt, %pg, i8 %val) + ret %ret +} + +; Combine when load would otherwise be a post-inc +define void 
@ld1rb_dup_postinc(i8* %valp, * %out, %pg) #0 { +; CHECK-LABEL: ld1rb_dup_postinc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #-2 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: .LBB63_1: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: add w8, w8, #2 +; CHECK-NEXT: add x0, x0, #2 +; CHECK-NEXT: cmp w8, #100 +; CHECK-NEXT: st1b { z0.b }, p1, [x1] +; CHECK-NEXT: b.lo .LBB63_1 +; CHECK-NEXT: // %bb.2: // %for.end +; CHECK-NEXT: ret +entry: + br label %for.body + +for.body: + %inc = phi i32 [ 0, %entry ], [ %inc.next, %for.body ] + %ptr = getelementptr i8, i8* %valp, i32 %inc + %val = load i8, i8* %ptr + %load = tail call @llvm.aarch64.sve.dup.nxv16i8( undef, %pg, i8 %val) + store volatile %load, * %out + %inc.next = add i32 %inc, 2 + %cond = icmp uge i32 %inc, 100 + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + +; Don't combine when passthru is non-zero/undef and pred is not all active +define @ld1rb_dup_nonzero_pt(i8* %valp, %pg, %pt) #0 { +; CHECK-LABEL: ld1rb_dup_nonzero_pt: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: mov z0.b, p0/m, w8 +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ret = tail call @llvm.aarch64.sve.dup.nxv16i8( %pt, %pg, i8 %val) + ret %ret +} + define @dupq_ld1rqd_f64(<2 x double>* %a) { ; CHECK-LABEL: dupq_ld1rqd_f64: ; CHECK: // %bb.0: @@ -819,6 +968,17 @@ ret %3 } +declare @llvm.aarch64.sve.ptrue.nxv16i1(i32) + +declare @llvm.aarch64.sve.dup.nxv16i8(, , i8) +declare @llvm.aarch64.sve.dup.nxv8i16(, , i16) +declare @llvm.aarch64.sve.dup.nxv4i32(, , i32) +declare @llvm.aarch64.sve.dup.nxv2i64(, , i64) +declare @llvm.aarch64.sve.dup.nxv8f16(, , half) +declare @llvm.aarch64.sve.dup.nxv8bf16(, , bfloat) +declare @llvm.aarch64.sve.dup.nxv4f32(, , float) +declare @llvm.aarch64.sve.dup.nxv2f64(, , double) + declare @llvm.aarch64.sve.dupq.lane.nxv16i8(, i64) declare @llvm.aarch64.sve.dupq.lane.nxv8i16(, i64) declare 
@llvm.aarch64.sve.dupq.lane.nxv4i32(, i64) diff --git a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s --- a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s @@ -3416,22 +3416,22 @@ # CHECK-NEXT: 1 19 4.00 * U ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 11 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: 1 11 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.b }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.b }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rd { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rd { z31.d }, p7/z, [sp, #504] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.h }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.b }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rd { z0.d }, p0/z, 
[x0] +# CHECK-NEXT: 1 11 0.50 * ld1rd { z31.d }, p7/z, [sp, #504] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.h }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.s }, p7/z, [sp, #126] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z21.b }, p5/z, [x10, #112] @@ -3452,22 +3452,22 @@ # CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #-128] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #112] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z31.s }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z31.s }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rsw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z31.s }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 
11 0.50 * ld1rsb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rsw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z31.s }, p7/z, [sp, #252] # CHECK-NEXT: 1 11 0.50 * U ld1sb { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 16 2.00 * U ld1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 11 0.50 * ld1sb { z0.h }, p0/z, [sp, x0] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s @@ -4472,22 +4472,22 @@ # CHECK-NEXT: 2 9 0.50 * U ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 6 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: 1 6 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.b }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.b }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rd { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rd { z31.d }, p7/z, [sp, #504] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.d }, p0/z, [x0] -# 
CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.h }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.b }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rd { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rd { z31.d }, p7/z, [sp, #504] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.h }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.s }, p7/z, [sp, #126] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z21.b }, p5/z, [x10, #112] @@ -4508,22 +4508,22 @@ # CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #-128] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #112] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z31.s }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 
0.33 * U ld1rsh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z31.s }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rsw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z31.s }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rsw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z31.s }, p7/z, [sp, #252] # CHECK-NEXT: 1 6 0.33 * U ld1sb { z0.d }, p0/z, [x0] # CHECK-NEXT: 4 9 1.00 * U ld1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 6 0.50 * ld1sb { z0.h }, p0/z, [sp, x0]