diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -310,7 +310,8 @@ def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>; def AArch64ptest_any : SDNode<"AArch64ISD::PTEST_ANY", SDT_AArch64PTest>; -def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>; +def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0, 1>]>; def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>; def AArch64splice : SDNode<"AArch64ISD::SPLICE", SDT_AArch64Arith>; @@ -2280,43 +2281,46 @@ } let AddedComplexity = 1 in { - class LD1RPat : - Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))), - (load (ptrue 31), GPR64:$base, $offset)>; + multiclass LD1RPat { + def : Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))), + (load (ptrue 31), GPR64:$base, $offset)>; + def : Pat<(vt (AArch64dup_mt PPR:$pg, (index_vt (operator (CP GPR64:$base, immtype:$offset))), (SVEDup0Undef))), + (load $pg, GPR64:$base, $offset)>; + } } // LDR1 of 8-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; // LDR1 of 16-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; // LDR1 of 32-bit data - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; // LDR1 of 64-bit data - def : LD1RPat; + defm : LD1RPat; // LD1R of FP data - def : LD1RPat; - 
def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; - def : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; // LD1R of 128-bit masked data multiclass ld1rq_pat{ diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -819,6 +819,547 @@ ret %3 } +; +; +; Tests for dup and dupx: +; +; Positive tests: +; * dup with passthru=undef or passthru=zero. +; * dupx with an in-range immediate (hi, lo) for address offset. +; +; Negative tests: +; * dup with passthru as a parameter. +; * dupx with an out-of-range immediate for address offset. +; +; + + +define @dup_ld1rqb_i8_passthruundef( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rqb_i8_passthruundef: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %res = call @llvm.aarch64.sve.dup.nxv16i8( undef, %pg, i8 %ld) + ret %res +} +define @dup_ld1rqh_i16_passthruundef( %pg, i16* %addr) { +; CHECK-LABEL: dup_ld1rqh_i16_passthruundef: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %res = call @llvm.aarch64.sve.dup.nxv8i16( undef, %pg, i16 %ld) + ret %res +} +define @dup_ld1rqs_i32_passthruundef( %pg, i32* %addr) { +; CHECK-LABEL: dup_ld1rqs_i32_passthruundef: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i32, i32* %addr + %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ld) + ret %res +} +define @dup_ld1rqd_i64_passthruundef( %pg, i64* %addr) { +; CHECK-LABEL: dup_ld1rqd_i64_passthruundef: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i64, i64* %addr + %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ld) + ret %res +} +define @dup_ld1rqh_half_passthruundef( %pg, half* %addr) { +; CHECK-LABEL: 
dup_ld1rqh_half_passthruundef: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv8f16( undef, %pg, half %ld) + ret %res +} +define @dup_ld1rqs_float_passthruundef( %pg, float* %addr) { +; CHECK-LABEL: dup_ld1rqs_float_passthruundef: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load float, float* %addr + %res = call @llvm.aarch64.sve.dup.nxv4f32( undef, %pg, float %ld) + ret %res +} +define @dup_ld1rqd_double_passthruundef( %pg, double* %addr) { +; CHECK-LABEL: dup_ld1rqd_double_passthruundef: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load double, double* %addr + %res = call @llvm.aarch64.sve.dup.nxv2f64( undef, %pg, double %ld) + ret %res +} +define @dup_ld1rqb_i8_passthruzero( %pg, i8* %addr) { +; CHECK-LABEL: dup_ld1rqb_i8_passthruzero: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %res = call @llvm.aarch64.sve.dup.nxv16i8( zeroinitializer, %pg, i8 %ld) + ret %res +} +define @dup_ld1rqh_i16_passthruzero( %pg, i16* %addr) { +; CHECK-LABEL: dup_ld1rqh_i16_passthruzero: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %res = call @llvm.aarch64.sve.dup.nxv8i16( zeroinitializer, %pg, i16 %ld) + ret %res +} +define @dup_ld1rqs_i32_passthruzero( %pg, i32* %addr) { +; CHECK-LABEL: dup_ld1rqs_i32_passthruzero: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i32, i32* %addr + %res = call @llvm.aarch64.sve.dup.nxv4i32( zeroinitializer, %pg, i32 %ld) + ret %res +} +define @dup_ld1rqd_i64_passthruzero( %pg, i64* %addr) { +; CHECK-LABEL: dup_ld1rqd_i64_passthruzero: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load i64, i64* %addr + %res = call @llvm.aarch64.sve.dup.nxv2i64( 
zeroinitializer, %pg, i64 %ld) + ret %res +} +define @dup_ld1rqh_half_passthruzero( %pg, half* %addr) { +; CHECK-LABEL: dup_ld1rqh_half_passthruzero: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv8f16( zeroinitializer, %pg, half %ld) + ret %res +} +define @dup_ld1rqs_float_passthruzero( %pg, float* %addr) { +; CHECK-LABEL: dup_ld1rqs_float_passthruzero: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load float, float* %addr + %res = call @llvm.aarch64.sve.dup.nxv4f32( zeroinitializer, %pg, float %ld) + ret %res +} +define @dup_ld1rqd_double_passthruzero( %pg, double* %addr) { +; CHECK-LABEL: dup_ld1rqd_double_passthruzero: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %ld = load double, double* %addr + %res = call @llvm.aarch64.sve.dup.nxv2f64( zeroinitializer, %pg, double %ld) + ret %res +} +define @dupx_ld1rqb_i8_immlo(i8* %addr) { +; CHECK-LABEL: dupx_ld1rqb_i8_immlo: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0, #1] +; CHECK-NEXT: ret + %gep = getelementptr i8, i8* %addr, i32 1 + %ld = load i8, i8* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv16i8(i8 %ld) + ret %res +} +define @dupx_ld1rqh_i16_immlo(i16* %addr) { +; CHECK-LABEL: dupx_ld1rqh_i16_immlo: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #2] +; CHECK-NEXT: ret + %gep = getelementptr i16, i16* %addr, i32 1 + %ld = load i16, i16* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv8i16(i16 %ld) + ret %res +} +define @dupx_ld1rqs_i32_immlo(i32* %addr) { +; CHECK-LABEL: dupx_ld1rqs_i32_immlo: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #4] +; CHECK-NEXT: ret + %gep = getelementptr i32, i32* %addr, i32 1 + %ld = load i32, i32* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv4i32(i32 %ld) + ret %res +} 
+define @dupx_ld1rqd_i64_immlo(i64* %addr) { +; CHECK-LABEL: dupx_ld1rqd_i64_immlo: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #8] +; CHECK-NEXT: ret + %gep = getelementptr i64, i64* %addr, i32 1 + %ld = load i64, i64* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 %ld) + ret %res +} +define @dupx_ld1rqh_half_immlo(half* %addr) { +; CHECK-LABEL: dupx_ld1rqh_half_immlo: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #2] +; CHECK-NEXT: ret + %gep = getelementptr half, half* %addr, i32 1 + %ld = load half, half* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv8f16(half %ld) + ret %res +} +define @dupx_ld1rqs_float_immlo(float* %addr) { +; CHECK-LABEL: dupx_ld1rqs_float_immlo: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #4] +; CHECK-NEXT: ret + %gep = getelementptr float, float* %addr, i32 1 + %ld = load float, float* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv4f32(float %ld) + ret %res +} +define @dupx_ld1rqd_double_immlo(double* %addr) { +; CHECK-LABEL: dupx_ld1rqd_double_immlo: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #8] +; CHECK-NEXT: ret + %gep = getelementptr double, double* %addr, i32 1 + %ld = load double, double* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv2f64(double %ld) + ret %res +} +define @dupx_ld1rqb_i8_immhi(i8* %addr) { +; CHECK-LABEL: dupx_ld1rqb_i8_immhi: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0, #63] +; CHECK-NEXT: ret + %gep = getelementptr i8, i8* %addr, i32 63 + %ld = load i8, i8* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv16i8(i8 %ld) + ret %res +} +define @dupx_ld1rqh_i16_immhi(i16* %addr) { +; CHECK-LABEL: dupx_ld1rqh_i16_immhi: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #126] +; CHECK-NEXT: ret + %gep = getelementptr i16, i16* %addr, i32 63 + %ld = load i16, i16* 
%gep + %res = call @llvm.aarch64.sve.dup.x.nxv8i16(i16 %ld) + ret %res +} +define @dupx_ld1rqs_i32_immhi(i32* %addr) { +; CHECK-LABEL: dupx_ld1rqs_i32_immhi: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #252] +; CHECK-NEXT: ret + %gep = getelementptr i32, i32* %addr, i32 63 + %ld = load i32, i32* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv4i32(i32 %ld) + ret %res +} +define @dupx_ld1rqd_i64_immhi(i64* %addr) { +; CHECK-LABEL: dupx_ld1rqd_i64_immhi: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #504] +; CHECK-NEXT: ret + %gep = getelementptr i64, i64* %addr, i32 63 + %ld = load i64, i64* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 %ld) + ret %res +} +define @dupx_ld1rqh_half_immhi(half* %addr) { +; CHECK-LABEL: dupx_ld1rqh_half_immhi: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #126] +; CHECK-NEXT: ret + %gep = getelementptr half, half* %addr, i32 63 + %ld = load half, half* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv8f16(half %ld) + ret %res +} +define @dupx_ld1rqs_float_immhi(float* %addr) { +; CHECK-LABEL: dupx_ld1rqs_float_immhi: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #252] +; CHECK-NEXT: ret + %gep = getelementptr float, float* %addr, i32 63 + %ld = load float, float* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv4f32(float %ld) + ret %res +} +define @dupx_ld1rqd_double_immhi(double* %addr) { +; CHECK-LABEL: dupx_ld1rqd_double_immhi: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #504] +; CHECK-NEXT: ret + %gep = getelementptr double, double* %addr, i32 63 + %ld = load double, double* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv2f64(double %ld) + ret %res +} +define @negtest_dup_ld1rqb_i8_passthru( %pt, %pg, i8* %addr) { +; CHECK-LABEL: negtest_dup_ld1rqb_i8_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; 
CHECK-NEXT: mov z0.b, p0/m, w8 +; CHECK-NEXT: ret + %ld = load i8, i8* %addr + %res = call @llvm.aarch64.sve.dup.nxv16i8( %pt, %pg, i8 %ld) + ret %res +} +define @negtest_dup_ld1rqh_i16_passthru( %pt, %pg, i16* %addr) { +; CHECK-LABEL: negtest_dup_ld1rqh_i16_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: mov z0.h, p0/m, w8 +; CHECK-NEXT: ret + %ld = load i16, i16* %addr + %res = call @llvm.aarch64.sve.dup.nxv8i16( %pt, %pg, i16 %ld) + ret %res +} +define @negtest_dup_ld1rqs_i32_passthru( %pt, %pg, i32* %addr) { +; CHECK-LABEL: negtest_dup_ld1rqs_i32_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: mov z0.s, p0/m, w8 +; CHECK-NEXT: ret + %ld = load i32, i32* %addr + %res = call @llvm.aarch64.sve.dup.nxv4i32( %pt, %pg, i32 %ld) + ret %res +} +define @negtest_dup_ld1rqd_i64_passthru( %pt, %pg, i64* %addr) { +; CHECK-LABEL: negtest_dup_ld1rqd_i64_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: mov z0.d, p0/m, x8 +; CHECK-NEXT: ret + %ld = load i64, i64* %addr + %res = call @llvm.aarch64.sve.dup.nxv2i64( %pt, %pg, i64 %ld) + ret %res +} +define @negtest_dup_ld1rqh_half_passthru( %pt, %pg, half* %addr) { +; CHECK-LABEL: negtest_dup_ld1rqh_half_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: ret + %ld = load half, half* %addr + %res = call @llvm.aarch64.sve.dup.nxv8f16( %pt, %pg, half %ld) + ret %res +} +define @negtest_dup_ld1rqs_float_passthru( %pt, %pg, float* %addr) { +; CHECK-LABEL: negtest_dup_ld1rqs_float_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: ret + %ld = load float, float* %addr + %res = call @llvm.aarch64.sve.dup.nxv4f32( %pt, %pg, float %ld) + ret %res +} +define @negtest_dup_ld1rqd_double_passthru( %pt, %pg, double* %addr) { +; CHECK-LABEL: negtest_dup_ld1rqd_double_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: mov z0.d, p0/m, d1 +; 
CHECK-NEXT: ret + %ld = load double, double* %addr + %res = call @llvm.aarch64.sve.dup.nxv2f64( %pt, %pg, double %ld) + ret %res +} +define @negtest_dupx_ld1rqb_i8_immlo_outrange(i8* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqb_i8_immlo_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr i8, i8* %addr, i32 -1 + %ld = load i8, i8* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv16i8(i8 %ld) + ret %res +} +define @negtest_dupx_ld1rqh_i16_immlo_outrange(i16* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqh_i16_immlo_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #2 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr i16, i16* %addr, i32 -1 + %ld = load i16, i16* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv8i16(i16 %ld) + ret %res +} +define @negtest_dupx_ld1rqs_i32_immlo_outrange(i32* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqs_i32_immlo_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #4 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr i32, i32* %addr, i32 -1 + %ld = load i32, i32* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv4i32(i32 %ld) + ret %res +} +define @negtest_dupx_ld1rqd_i64_immlo_outrange(i64* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqd_i64_immlo_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr i64, i64* %addr, i32 -1 + %ld = load i64, i64* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 %ld) + ret %res +} +define @negtest_dupx_ld1rqh_half_immlo_outrange(half* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqh_half_immlo_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #2 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = 
getelementptr half, half* %addr, i32 -1 + %ld = load half, half* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv8f16(half %ld) + ret %res +} +define @negtest_dupx_ld1rqs_float_immlo_outrange(float* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqs_float_immlo_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #4 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr float, float* %addr, i32 -1 + %ld = load float, float* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv4f32(float %ld) + ret %res +} +define @negtest_dupx_ld1rqd_double_immlo_outrange(double* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqd_double_immlo_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr double, double* %addr, i32 -1 + %ld = load double, double* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv2f64(double %ld) + ret %res +} +define @negtest_dupx_ld1rqb_i8_immhi_outrange(i8* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqb_i8_immhi_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #64 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr i8, i8* %addr, i32 64 + %ld = load i8, i8* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv16i8(i8 %ld) + ret %res +} + +define @negtest_dupx_ld1rqh_i16_immhi_outrange(i16* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqh_i16_immhi_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #128 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr i16, i16* %addr, i32 64 + %ld = load i16, i16* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv8i16(i16 %ld) + ret %res +} + +define @negtest_dupx_ld1rqs_i32_immhi_outrange(i32* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqs_i32_immhi_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #256 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s 
}, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr i32, i32* %addr, i32 64 + %ld = load i32, i32* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv4i32(i32 %ld) + ret %res +} + +define @negtest_dupx_ld1rqd_i64_immhi_outrange(i64* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqd_i64_immhi_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #512 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr i64, i64* %addr, i32 64 + %ld = load i64, i64* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 %ld) + ret %res +} + +define @negtest_dupx_ld1rqh_half_immhi_outrange(half* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqh_half_immhi_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #128 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr half, half* %addr, i32 64 + %ld = load half, half* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv8f16(half %ld) + ret %res +} + +define @negtest_dupx_ld1rqs_float_immhi_outrange(float* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqs_float_immhi_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #256 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr float, float* %addr, i32 64 + %ld = load float, float* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv4f32(float %ld) + ret %res +} + +define @negtest_dupx_ld1rqd_double_immhi_outrange(double* %addr) { +; CHECK-LABEL: negtest_dupx_ld1rqd_double_immhi_outrange: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #512 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr double, double* %addr, i32 64 + %ld = load double, double* %gep + %res = call @llvm.aarch64.sve.dup.x.nxv2f64(double %ld) + ret %res +} + + declare @llvm.aarch64.sve.dupq.lane.nxv16i8(, i64) declare @llvm.aarch64.sve.dupq.lane.nxv8i16(, i64) declare @llvm.aarch64.sve.dupq.lane.nxv4i32(, i64) @@ 
-837,4 +1378,21 @@ declare @llvm.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) declare @llvm.vector.insert.nxv8bf16.v8bf16(, <8 x bfloat>, i64) +declare @llvm.aarch64.sve.dup.nxv16i8(, , i8) +declare @llvm.aarch64.sve.dup.nxv8i16(, , i16) +declare @llvm.aarch64.sve.dup.nxv4i32(, , i32) +declare @llvm.aarch64.sve.dup.nxv2i64(, , i64) +declare @llvm.aarch64.sve.dup.nxv8f16(, , half) +declare @llvm.aarch64.sve.dup.nxv4f32(, , float) +declare @llvm.aarch64.sve.dup.nxv2f64(, , double) + +declare @llvm.aarch64.sve.dup.x.nxv16i8(i8) +declare @llvm.aarch64.sve.dup.x.nxv8i16(i16) +declare @llvm.aarch64.sve.dup.x.nxv4i32(i32) +declare @llvm.aarch64.sve.dup.x.nxv2i64(i64) +declare @llvm.aarch64.sve.dup.x.nxv8f16(half) +declare @llvm.aarch64.sve.dup.x.nxv4f32(float) +declare @llvm.aarch64.sve.dup.x.nxv2f64(double) + + attributes #0 = { "target-features"="+sve,+bf16" } diff --git a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s --- a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s @@ -3416,22 +3416,22 @@ # CHECK-NEXT: 1 19 4.00 * U ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 11 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: 1 11 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.b }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.b }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rd { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rd { z31.d }, p7/z, [sp, #504] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { 
z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.h }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.b }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rd { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rd { z31.d }, p7/z, [sp, #504] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.h }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rh { z31.s }, p7/z, [sp, #126] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z21.b }, p5/z, [x10, #112] @@ -3452,22 +3452,22 @@ # CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #-128] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #112] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z31.s }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 0.50 * U ld1rsb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 11 
0.50 * U ld1rsh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rsh { z31.s }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rsw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rsw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 11 0.50 * U ld1rw { z31.s }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rsb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rsh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 11 0.50 * ld1rsw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rsw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 11 0.50 * ld1rw { z31.s }, p7/z, [sp, #252] # CHECK-NEXT: 1 11 0.50 * U ld1sb { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 16 2.00 * U ld1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 11 0.50 * ld1sb { z0.h }, p0/z, [sp, x0] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s @@ -4472,22 +4472,22 @@ # CHECK-NEXT: 
2 9 0.50 * U ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 6 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: 1 6 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.b }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.b }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rd { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rd { z31.d }, p7/z, [sp, #504] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.h }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.b }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rd { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rd { z31.d }, p7/z, [sp, #504] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.h }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rh { z31.s }, p7/z, 
[sp, #126] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z21.b }, p5/z, [x10, #112] @@ -4508,22 +4508,22 @@ # CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #-128] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #112] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z31.s }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.d }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.h }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsb { z31.s }, p7/z, [sp, #63] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z31.d }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rsh { z31.s }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 0.33 * U ld1rsw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rsw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z0.d }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z0.s }, p0/z, [x0] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z31.d }, p7/z, [sp, #252] -# CHECK-NEXT: 1 6 0.33 * U ld1rw { z31.s }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rsh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 6 0.33 * ld1rsw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rsw { 
z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 6 0.33 * ld1rw { z31.s }, p7/z, [sp, #252] # CHECK-NEXT: 1 6 0.33 * U ld1sb { z0.d }, p0/z, [x0] # CHECK-NEXT: 4 9 1.00 * U ld1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 6 0.50 * ld1sb { z0.h }, p0/z, [sp, x0]