diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -327,7 +327,8 @@ def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>; def AArch64ptest_any : SDNode<"AArch64ISD::PTEST_ANY", SDT_AArch64PTest>; -def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>; +def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0, 1>]>; def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>; def AArch64splice : SDNode<"AArch64ISD::SPLICE", SDT_AArch64Arith>; @@ -2297,43 +2298,46 @@ } let AddedComplexity = 1 in { - class LD1RPat : - Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))), - (load (ptrue 31), GPR64:$base, $offset)>; + multiclass LD1RPat { + def : Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))), + (load (ptrue 31), GPR64:$base, $offset)>; + def : Pat<(vt (AArch64dup_mt PPR:$pg, (index_vt (operator (CP GPR64:$base, immtype:$offset))), (SVEDup0Undef))), + (load $pg, GPR64:$base, $offset)>; + } } // LDR1 of 8-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; // LDR1 of 16-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; // LDR1 of 32-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; // LDR1 of 64-bit data - def : LD1RPat; + defm : LD1RPat; // LD1R of FP data - def : LD1RPat; - 
def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; // LD1R of 128-bit masked data multiclass ld1rq_pat{ diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -819,6 +819,373 @@ ret %3 } +; +; +; Tests for dup: +; +; Positive tests: +; * dup with passthru=undef or passthru=zero. +; * sign/zero extending. +; * unpacked types. +; +; Negative tests: +; * dup with passthru as a parameter. +; +; + +define @dup_ld1rb_i8_passthruundef_nxv16i8( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rb_i8_passthruundef_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %res = call @llvm.aarch64.sve.dup.nxv16i8( undef, %pg, i8 %ld) + ret %res +} +define @dup_ld1rh_i16_passthruundef_nxv8i16( %pg, i16* %addr) { +; CHECK-LABEL: dup_ld1rh_i16_passthruundef_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %res = call @llvm.aarch64.sve.dup.nxv8i16( undef, %pg, i16 %ld) + ret %res +} +define @dup_ld1rh_i8_passthruundef_nxv8i16_sext( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rsb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %ext = sext i8 %ld to i16 + %res = call @llvm.aarch64.sve.dup.nxv8i16( undef, %pg, i16 %ext) + ret %res +} +define @dup_ld1rh_i8_passthruundef_nxv8i16_zext( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %ext = zext i8 %ld to i16 + %res = call @llvm.aarch64.sve.dup.nxv8i16( undef, %pg, i16 %ext) + ret %res +} +define 
@dup_ld1rs_i32_passthruundef_nxv4i32( %pg, i32* %addr) { +; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i32, i32* %addr + %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ld) + ret %res +} +define @dup_ld1rs_i8_passthruundef_nxv4i32_sext( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rsb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %ext = sext i8 %ld to i32 + %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ext) + ret %res +} +define @dup_ld1rs_i8_passthruundef_nxv4i32_zext( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %ext = zext i8 %ld to i32 + %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ext) + ret %res +} +define @dup_ld1rs_i16_passthruundef_nxv4i32_sext( %pg, i16* %addr) { +; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rsh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %ext = sext i16 %ld to i32 + %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ext) + ret %res +} +define @dup_ld1rs_i16_passthruundef_nxv4i32_zext( %pg, i16* %addr) { +; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %ext = zext i16 %ld to i32 + %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ext) + ret %res +} +define @dup_ld1rd_i64_passthruundef_nxv2i64( %pg, i64* %addr) { +; CHECK-LABEL: dup_ld1rd_i64_passthruundef_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i64, i64* %addr + %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ld) + ret %res +} +define 
@dup_ld1rs_i8_passthruundef_nxv2i64_sext( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %ext = sext i8 %ld to i64 + %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) + ret %res +} +define @dup_ld1rs_i8_passthruundef_nxv2i64_zext( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %ext = zext i8 %ld to i64 + %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) + ret %res +} +define @dup_ld1rs_i16_passthruundef_nxv2i64_sext( %pg, i16* %addr) { +; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rsh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %ext = sext i16 %ld to i64 + %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) + ret %res +} +define @dup_ld1rs_i16_passthruundef_nxv2i64_zext( %pg, i16* %addr) { +; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %ext = zext i16 %ld to i64 + %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) + ret %res +} +define @dup_ld1rs_i32_passthruundef_nxv2i64_sext( %pg, i32* %addr) { +; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rsw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i32, i32* %addr + %ext = sext i32 %ld to i64 + %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) + ret %res +} +define @dup_ld1rs_i32_passthruundef_nxv2i64_zext( %pg, i32* %addr) { +; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i32, i32* %addr + %ext = zext i32 %ld to i64 + %res = 
call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) + ret %res +} +define @dup_ld1rh_half_passthruundef_nxv8f16( %pg, half* %addr) { +; CHECK-LABEL: dup_ld1rh_half_passthruundef_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv8f16( undef, %pg, half %ld) + ret %res +} +define @dup_ld1rs_float_passthruundef_nxv4f32( %pg, float* %addr) { +; CHECK-LABEL: dup_ld1rs_float_passthruundef_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load float, float* %addr + %res = call @llvm.aarch64.sve.dup.nxv4f32( undef, %pg, float %ld) + ret %res +} +define @dup_ld1rd_double_passthruundef_nxv2f64( %pg, double* %addr) { +; CHECK-LABEL: dup_ld1rd_double_passthruundef_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load double, double* %addr + %res = call @llvm.aarch64.sve.dup.nxv2f64( undef, %pg, double %ld) + ret %res +} +define @dup_ld1rh_half_passthruundef_nxv4f16( %pg, half* %addr) { +; CHECK-LABEL: dup_ld1rh_half_passthruundef_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv4f16( undef, %pg, half %ld) + ret %res +} +define @dup_ld1rb_i8_passthruzero_nxv16i8( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rb_i8_passthruzero_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %res = call @llvm.aarch64.sve.dup.nxv16i8( zeroinitializer, %pg, i8 %ld) + ret %res +} +define @dup_ld1rh_i16_passthruzero_nxv8i16( %pg, i16* %addr) { +; CHECK-LABEL: dup_ld1rh_i16_passthruzero_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %res = call @llvm.aarch64.sve.dup.nxv8i16( zeroinitializer, %pg, i16 %ld) + ret %res +} +define 
@dup_ld1rs_i32_passthruzero_nxv4i32( %pg, i32* %addr) { +; CHECK-LABEL: dup_ld1rs_i32_passthruzero_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i32, i32* %addr + %res = call @llvm.aarch64.sve.dup.nxv4i32( zeroinitializer, %pg, i32 %ld) + ret %res +} +define @dup_ld1rd_i64_passthruzero_nxv2i64( %pg, i64* %addr) { +; CHECK-LABEL: dup_ld1rd_i64_passthruzero_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i64, i64* %addr + %res = call @llvm.aarch64.sve.dup.nxv2i64( zeroinitializer, %pg, i64 %ld) + ret %res +} +define @dup_ld1rh_half_passthruzero_nxv8f16( %pg, half* %addr) { +; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv8f16( zeroinitializer, %pg, half %ld) + ret %res +} +define @dup_ld1rs_float_passthruzero_nxv4f32( %pg, float* %addr) { +; CHECK-LABEL: dup_ld1rs_float_passthruzero_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load float, float* %addr + %res = call @llvm.aarch64.sve.dup.nxv4f32( zeroinitializer, %pg, float %ld) + ret %res +} +define @dup_ld1rd_double_passthruzero_nxv2f64( %pg, double* %addr) { +; CHECK-LABEL: dup_ld1rd_double_passthruzero_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load double, double* %addr + %res = call @llvm.aarch64.sve.dup.nxv2f64( zeroinitializer, %pg, double %ld) + ret %res +} +define @dup_ld1rh_half_passthruzero_nxv4f16( %pg, half* %addr) { +; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv4f16( zeroinitializer, %pg, half %ld) + ret %res +} +define @dup_ld1rh_half_passthruzero_nxv2f16( %pg, half* %addr) { +; 
CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv2f16( zeroinitializer, %pg, half %ld) + ret %res +} +define @dup_ld1rs_float_passthruzero_nxv2f32( %pg, float* %addr) { +; CHECK-LABEL: dup_ld1rs_float_passthruzero_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load float, float* %addr + %res = call @llvm.aarch64.sve.dup.nxv2f32( zeroinitializer, %pg, float %ld) + ret %res +} +define @negtest_dup_ld1rb_i8_passthru_nxv16i8( %pt, %pg, i8* %addr) { +; CHECK-LABEL: negtest_dup_ld1rb_i8_passthru_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: mov z0.b, p0/m, w8 +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %res = call @llvm.aarch64.sve.dup.nxv16i8( %pt, %pg, i8 %ld) + ret %res +} +define @negtest_dup_ld1rh_i16_passthru_nxv8i16( %pt, %pg, i16* %addr) { +; CHECK-LABEL: negtest_dup_ld1rh_i16_passthru_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: mov z0.h, p0/m, w8 +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %res = call @llvm.aarch64.sve.dup.nxv8i16( %pt, %pg, i16 %ld) + ret %res +} +define @negtest_dup_ld1rs_i32_passthru_nxv4i32( %pt, %pg, i32* %addr) { +; CHECK-LABEL: negtest_dup_ld1rs_i32_passthru_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: mov z0.s, p0/m, w8 +; CHECK-NEXT: ret + %ld = load i32, i32* %addr + %res = call @llvm.aarch64.sve.dup.nxv4i32( %pt, %pg, i32 %ld) + ret %res +} +define @negtest_dup_ld1rd_i64_passthru_nxv2i64( %pt, %pg, i64* %addr) { +; CHECK-LABEL: negtest_dup_ld1rd_i64_passthru_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: mov z0.d, p0/m, x8 +; CHECK-NEXT: ret + %ld = load i64, i64* %addr + %res = call @llvm.aarch64.sve.dup.nxv2i64( %pt, %pg, i64 %ld) + ret %res +} +define @negtest_dup_ld1rh_half_passthru_nxv8f16( %pt, %pg, half* %addr) { 
+; CHECK-LABEL: negtest_dup_ld1rh_half_passthru_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv8f16( %pt, %pg, half %ld) + ret %res +} +define @negtest_dup_ld1rs_float_passthru_nxv4f32( %pt, %pg, float* %addr) { +; CHECK-LABEL: negtest_dup_ld1rs_float_passthru_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: ret + %ld = load float, float* %addr + %res = call @llvm.aarch64.sve.dup.nxv4f32( %pt, %pg, float %ld) + ret %res +} +define @negtest_dup_ld1rd_double_passthru_nxv2f64( %pt, %pg, double* %addr) { +; CHECK-LABEL: negtest_dup_ld1rd_double_passthru_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: ret + %ld = load double, double* %addr + %res = call @llvm.aarch64.sve.dup.nxv2f64( %pt, %pg, double %ld) + ret %res +} + declare @llvm.aarch64.sve.dupq.lane.nxv16i8(, i64) declare @llvm.aarch64.sve.dupq.lane.nxv8i16(, i64) declare @llvm.aarch64.sve.dupq.lane.nxv4i32(, i64) @@ -837,4 +1204,16 @@ declare @llvm.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) declare @llvm.vector.insert.nxv8bf16.v8bf16(, <8 x bfloat>, i64) +declare @llvm.aarch64.sve.dup.nxv16i8(, , i8) +declare @llvm.aarch64.sve.dup.nxv8i16(, , i16) +declare @llvm.aarch64.sve.dup.nxv4i32(, , i32) +declare @llvm.aarch64.sve.dup.nxv2i64(, , i64) +declare @llvm.aarch64.sve.dup.nxv8f16(, , half) +declare @llvm.aarch64.sve.dup.nxv4f32(, , float) +declare @llvm.aarch64.sve.dup.nxv2f64(, , double) +declare @llvm.aarch64.sve.dup.nxv4f16(, , half) +declare @llvm.aarch64.sve.dup.nxv2f16(, , half) +declare @llvm.aarch64.sve.dup.nxv2f32(, , float) + + attributes #0 = { "target-features"="+sve,+bf16" } diff --git a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s --- 
a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s @@ -3416,22 +3416,22 @@ # CHECK-NEXT: 1 19 4.00 * U ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 11 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: 1 11 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.b }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.b }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rd { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rd { z31.d }, p7/z, [sp, #504] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.h }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.b }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rd { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rd { z31.d }, p7/z, [sp, #504] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 
11 0.50 * ld1rh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.h }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.s }, p7/z, [sp, #126] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z21.b }, p5/z, [x10, #112] @@ -3452,22 +3452,22 @@ # CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #-128] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #112] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z31.s }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z31.s }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rsw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z31.s }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * 
ld1rsh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rsw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z31.s }, p7/z, [sp, #252] # CHECK-NEXT: 1 11 0.50 * U ld1sb { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 16 2.00 * U ld1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 11 0.50 * ld1sb { z0.h }, p0/z, [sp, x0] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s @@ -4472,22 +4472,22 @@ # CHECK-NEXT: 2 9 0.50 * U ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 6 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: 1 6 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.b }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.b }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rd { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rd { z31.d }, p7/z, [sp, #504] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * 
U ld1rh { z31.h }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.b }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rd { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rd { z31.d }, p7/z, [sp, #504] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.h }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.s }, p7/z, [sp, #126] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z21.b }, p5/z, [x10, #112] @@ -4508,22 +4508,22 @@ # CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #-128] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #112] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z31.s }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z31.s }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U 
ld1rsw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z31.s }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rsw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z31.s }, p7/z, [sp, #252] # CHECK-NEXT: 1 6 0.33 * U ld1sb { z0.d }, p0/z, [x0] # CHECK-NEXT: 4 9 1.00 * U ld1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 6 0.50 * ld1sb { z0.h }, p0/z, [sp, x0]