diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -937,15 +937,20 @@ defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>; defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>; - let AddedComplexity = 1 in { - class LD1RQPat : - Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))), - (load_instr (ptrue 31), GPR64sp:$Xn, 0)>; + multiclass sve_ld1rq_duplane_pat { + def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))), + (load_instr_imm (ptrue 31), GPR64sp:$Xn, 0)>; + let AddedComplexity = 2 in { + def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (add GPR64sp:$Xn, simm4s16:$imm))), (i64 0))), (i64 0))), + (load_instr_imm (ptrue 31), GPR64sp:$Xn, simm4s16:$imm)>; + } + def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (AddrCP GPR64sp:$Xn, GPR64sp:$idx))), (i64 0))), (i64 0))), + (load_instr_scalar (ptrue 31), GPR64sp:$Xn, $idx)>; } - def : LD1RQPat; - def : LD1RQPat; - def : LD1RQPat; - def : LD1RQPat; + defm : sve_ld1rq_duplane_pat; + defm : sve_ld1rq_duplane_pat; + defm : sve_ld1rq_duplane_pat; + defm : sve_ld1rq_duplane_pat; // continuous load with reg+reg addressing. 
defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>; @@ -2260,24 +2265,22 @@ def : LD1RPat; def : LD1RPat; - // LD1R of 128-bit masked data - def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), - (LD1RQ_B_IMM $gp, $base, (i64 0))>; - def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), - (LD1RQ_H_IMM $gp, $base, (i64 0))>; - def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), - (LD1RQ_W_IMM $gp, $base, (i64 0))>; - def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), - (LD1RQ_D_IMM $gp, $base, (i64 0))>; - - def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), - (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>; - def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), - (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>; - def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), - (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>; - def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), - (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>; + // LD1RQ of 128-bit masked data: base-only, base + simm4s16 immediate, and base + AddrCP index addressing. + multiclass ld1rq_pat{ + def : Pat<(vt1 (op PPR:$gp, GPR64:$base)), + (!cast(load_instr # _IMM) $gp, $base, (i64 0))>; + let AddedComplexity = 2 in { + def : Pat<(vt1 (op PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (!cast(load_instr # _IMM) $gp, $base, simm4s16:$imm)>; + } + def : Pat<(vt1 (op PPR:$gp, (AddrCP GPR64:$base, GPR64:$idx))), + (load_instr $gp, $base, $idx)>; + } + + defm : ld1rq_pat; + defm : ld1rq_pat; + defm : ld1rq_pat; + defm : ld1rq_pat; def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll @@ -24,6 +24,16 @@ ret %res } +define @ld1rqb_i8_scalar( %pred, i8* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqb_i8_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx + %res = call @llvm.aarch64.sve.ld1rq.nxv16i8( %pred, i8* %ptr) + ret %res +} + define @ld1rqb_i8_imm_lower_bound( %pred, i8* %addr) { ; CHECK-LABEL: ld1rqb_i8_imm_lower_bound: ; CHECK: // %bb.0: @@ -47,8 +57,8 @@ define @ld1rqb_i8_imm_out_of_lower_bound( %pred, i8* %addr) { ; CHECK-LABEL: ld1rqb_i8_imm_out_of_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #129 -; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8] +; CHECK-NEXT: mov x8, #-129 +; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, i8* %addr, i64 -129 %res = call @llvm.aarch64.sve.ld1rq.nxv16i8( %pred, i8* %ptr) @@ -58,14 +68,41 @@ define @ld1rqb_i8_imm_out_of_upper_bound( %pred, i8* %addr) { ; CHECK-LABEL: ld1rqb_i8_imm_out_of_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #113 -; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8] +; CHECK-NEXT: mov w8, #113 +; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, i8* %addr, i64 113 %res = call @llvm.aarch64.sve.ld1rq.nxv16i8( %pred, i8* %ptr) ret %res } +define @ld1rqb_i8_imm_dupqlane( %pred, <16 x i8>* %addr) { +; CHECK-LABEL: ld1rqb_i8_imm_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, #-16] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds <16 x i8>, <16 x i8>* %addr, i16 -1 + %load = load <16 x i8>, <16 x i8>* %ptr + %1 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, <16 x i8> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %1, i64 0) + ret %2 +} + +define @ld1rqb_i8_scalar_dupqlane( %pred, i8* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqb_i8_scalar_dupqlane: +; CHECK: // 
%bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx + %ptr_bitcast = bitcast i8* %ptr to <16 x i8>* + %load = load <16 x i8>, <16 x i8>* %ptr_bitcast + %1 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, <16 x i8> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %1, i64 0) + ret %2 +} + ; ; LD1RQH ; @@ -108,6 +145,26 @@ ret %res } +define @ld1rqh_i16_scalar( %pred, i16* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqh_i16_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %idx + %res = call @llvm.aarch64.sve.ld1rq.nxv8i16( %pred, i16* %ptr) + ret %res +} + +define @ld1rqh_f16_scalar( %pred, half* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqh_f16_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds half, half* %addr, i64 %idx + %res = call @llvm.aarch64.sve.ld1rq.nxv8f16( %pred, half* %ptr) + ret %res +} + define @ld1rqh_bf16( %pred, bfloat* %addr) { ; CHECK-LABEL: ld1rqh_bf16: ; CHECK: // %bb.0: @@ -127,6 +184,97 @@ ret %res } +define @ld1rqh_bf16_scalar( %pred, bfloat* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqh_bf16_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %idx + %res = call @llvm.aarch64.sve.ld1rq.nxv8bf16( %pred, bfloat* %ptr) + ret %res +} + +define @ld1rqh_i16_imm_dupqlane( %pred, <8 x i16>* %addr) { +; CHECK-LABEL: ld1rqh_i16_imm_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds <8 x i16>, <8 x i16>* %addr, i16 -1 + %load = load <8 x i16>, <8 x i16>* %ptr + %1 = tail call @llvm.vector.insert.nxv8i16.v8i16( undef, <8 x i16> %load, i64 0) + %2 = 
tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %1, i64 0) + ret %2 +} + +define @ld1rqh_i16_scalar_dupqlane( %pred, i16* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqh_i16_scalar_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %idx + %ptr_bitcast = bitcast i16* %ptr to <8 x i16>* + %load = load <8 x i16>, <8 x i16>* %ptr_bitcast + %1 = tail call @llvm.vector.insert.nxv8i16.v8i16( undef, <8 x i16> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %1, i64 0) + ret %2 +} + +define @ld1rqh_f16_imm_dupqlane( %pred, <8 x half>* %addr) { +; CHECK-LABEL: ld1rqh_f16_imm_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds <8 x half>, <8 x half>* %addr, i16 -1 + %load = load <8 x half>, <8 x half>* %ptr + %1 = tail call @llvm.vector.insert.nxv8f16.v8f16( undef, <8 x half> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %1, i64 0) + ret %2 +} + +define @ld1rqh_f16_scalar_dupqlane( %pred, half* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqh_f16_scalar_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds half, half* %addr, i64 %idx + %ptr_bitcast = bitcast half* %ptr to <8 x half>* + %load = load <8 x half>, <8 x half>* %ptr_bitcast + %1 = tail call @llvm.vector.insert.nxv8f16.v8f16( undef, <8 x half> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv8f16( %1, i64 0) + ret %2 +} + +define @ld1rqh_bf16_imm_dupqlane( %pred, <8 x bfloat>* %addr) { +; CHECK-LABEL: ld1rqh_bf16_imm_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds <8 x bfloat>, <8 x bfloat>* %addr, i16 -1 + %load = load <8 x bfloat>, 
<8 x bfloat>* %ptr + %1 = tail call @llvm.vector.insert.nxv8bf16.v8bf16( undef, <8 x bfloat> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( %1, i64 0) + ret %2 +} + +define @ld1rqh_bf16_scalar_dupqlane( %pred, bfloat* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqh_bf16_scalar_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %idx + %ptr_bitcast = bitcast bfloat* %ptr to <8 x bfloat>* + %load = load <8 x bfloat>, <8 x bfloat>* %ptr_bitcast + %1 = tail call @llvm.vector.insert.nxv8bf16.v8bf16( undef, <8 x bfloat> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( %1, i64 0) + ret %2 +} + ; ; LD1RQW ; @@ -169,6 +317,80 @@ ret %res } +define @ld1rqw_i32_scalar( %pred, i32* %base, i64 %idx) { +; CHECK-LABEL: ld1rqw_i32_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i32, i32* %base, i64 %idx + %res = call @llvm.aarch64.sve.ld1rq.nxv4i32( %pred, i32* %ptr) + ret %res +} + +define @ld1rqw_f32_scalar( %pred, float* %base, i64 %idx) { +; CHECK-LABEL: ld1rqw_f32_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds float, float* %base, i64 %idx + %res = call @llvm.aarch64.sve.ld1rq.nxv4f32( %pred, float* %ptr) + ret %res +} + +define @ld1rqw_i32_imm_dupqlane( %pred, <4 x i32>* %addr) { +; CHECK-LABEL: ld1rqw_i32_imm_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, #16] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %addr, i32 1 + %load = load <4 x i32>, <4 x i32>* %ptr + %1 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, <4 x i32> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %1, i64 0) + ret %2 +} + +define @ld1rqw_i32_scalar_dupqlane( 
%pred, i32* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqw_i32_scalar_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i32, i32* %addr, i64 %idx + %ptr_bitcast = bitcast i32* %ptr to <4 x i32>* + %load = load <4 x i32>, <4 x i32>* %ptr_bitcast + %1 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, <4 x i32> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %1, i64 0) + ret %2 +} + +define @ld1rqw_f32_imm_dupqlane( %pred, <4 x float>* %addr) { +; CHECK-LABEL: ld1rqw_f32_imm_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, #16] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds <4 x float>, <4 x float>* %addr, i32 1 + %load = load <4 x float>, <4 x float>* %ptr + %1 = tail call @llvm.vector.insert.nxv4f32.v4f32( undef, <4 x float> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv4f32( %1, i64 0) + ret %2 +} + +define @ld1rqw_f32_scalar_dupqlane( %pred, float* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqw_f32_scalar_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds float, float* %addr, i64 %idx + %ptr_bitcast = bitcast float* %ptr to <4 x float>* + %load = load <4 x float>, <4 x float>* %ptr_bitcast + %1 = tail call @llvm.vector.insert.nxv4f32.v4f32( undef, <4 x float> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv4f32( %1, i64 0) + ret %2 +} + ; ; LD1RQD ; @@ -211,6 +433,80 @@ ret %res } +define @ld1rqd_i64_scalar( %pred, i64* %base, i64 %idx) { +; CHECK-LABEL: ld1rqd_i64_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i64, i64* %base, i64 %idx + %res = call @llvm.aarch64.sve.ld1rq.nxv2i64( %pred, i64* %ptr) + ret %res +} + +define @ld1rqd_f64_scalar( %pred, double* 
%base, i64 %idx) { +; CHECK-LABEL: ld1rqd_f64_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds double, double* %base, i64 %idx + %res = call @llvm.aarch64.sve.ld1rq.nxv2f64( %pred, double* %ptr) + ret %res +} + +define @ld1rqd_i64_imm_dupqlane( %pred, <2 x i64>* %addr) { +; CHECK-LABEL: ld1rqd_i64_imm_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, #16] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds <2 x i64>, <2 x i64>* %addr, i64 1 + %load = load <2 x i64>, <2 x i64>* %ptr + %1 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, <2 x i64> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %1, i64 0) + ret %2 +} + +define @ld1rqd_i64_scalar_dupqlane( %pred, i64* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqd_i64_scalar_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i64, i64* %addr, i64 %idx + %ptr_bitcast = bitcast i64* %ptr to <2 x i64>* + %load = load <2 x i64>, <2 x i64>* %ptr_bitcast + %1 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, <2 x i64> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %1, i64 0) + ret %2 +} + +define @ld1rqd_f64_imm_dupqlane( %pred, <2 x double>* %addr) { +; CHECK-LABEL: ld1rqd_f64_imm_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, #16] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds <2 x double>, <2 x double>* %addr, i64 1 + %load = load <2 x double>, <2 x double>* %ptr + %1 = tail call @llvm.vector.insert.nxv2f64.v2f64( undef, <2 x double> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv2f64( %1, i64 0) + ret %2 +} + +define @ld1rqd_f64_scalar_dupqlane( %pred, double* %addr, i64 %idx) { +; CHECK-LABEL: ld1rqd_f64_scalar_dupqlane: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d 
+; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds double, double* %addr, i64 %idx + %ptr_bitcast = bitcast double* %ptr to <2 x double>* + %load = load <2 x double>, <2 x double>* %ptr_bitcast + %1 = tail call @llvm.vector.insert.nxv2f64.v2f64( undef, <2 x double> %load, i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv2f64( %1, i64 0) + ret %2 +} + ; ; LDNT1B ; @@ -616,3 +912,21 @@ declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) + +declare @llvm.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) +declare @llvm.vector.insert.nxv2f64.v2f64(, <2 x double>, i64) +declare @llvm.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.vector.insert.nxv4f32.v4f32(, <4 x float>, i64) +declare @llvm.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) +declare @llvm.vector.insert.nxv8f16.v8f16(, <8 x half>, i64) +declare @llvm.vector.insert.nxv8bf16.v8bf16(, <8 x bfloat>, i64) +declare @llvm.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) + +declare @llvm.aarch64.sve.dupq.lane.nxv2i64(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv2f64(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv4i32(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv4f32(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv8i16(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv8f16(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv8bf16(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv16i8(, i64) diff --git a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s --- a/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/A64FX/A64FX-sve-instructions.s @@ -3432,22 +3432,22 @@ # CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.d }, p7/z, [sp, #126] # CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.h }, p7/z, [sp, #126] # 
CHECK-NEXT: 1 11 0.50 * U ld1rh { z31.s }, p7/z, [sp, #126] -# CHECK-NEXT: 1 11 0.50 * U ld1rqb { z0.b }, p0/z, [x0, x0] +# CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z0.b }, p0/z, [x0] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z21.b }, p5/z, [x10, #112] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z23.b }, p3/z, [x13, #-128] # CHECK-NEXT: 1 11 0.50 * ld1rqb { z31.b }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 11 0.50 * U ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: 1 11 0.50 * ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3] # CHECK-NEXT: 1 11 0.50 * ld1rqd { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 11 0.50 * ld1rqd { z23.d }, p3/z, [x13, #-128] # CHECK-NEXT: 1 11 0.50 * ld1rqd { z23.d }, p3/z, [x13, #112] # CHECK-NEXT: 1 11 0.50 * ld1rqd { z31.d }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 11 0.50 * U ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 11 0.50 * ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1] # CHECK-NEXT: 1 11 0.50 * ld1rqh { z0.h }, p0/z, [x0] # CHECK-NEXT: 1 11 0.50 * ld1rqh { z23.h }, p3/z, [x13, #-128] # CHECK-NEXT: 1 11 0.50 * ld1rqh { z23.h }, p3/z, [x13, #112] # CHECK-NEXT: 1 11 0.50 * ld1rqh { z31.h }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 11 0.50 * U ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 11 0.50 * ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #-128] # CHECK-NEXT: 1 11 0.50 * ld1rqw { z23.s }, p3/z, [x13, #112] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s @@ -4488,22 +4488,22 @@ # CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.d }, p7/z, [sp, #126] # CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.h }, p7/z, [sp, #126] # CHECK-NEXT: 1 6 0.33 * U ld1rh { z31.s }, p7/z, [sp, #126] -# CHECK-NEXT: 1 6 
0.33 * U ld1rqb { z0.b }, p0/z, [x0, x0] +# CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z0.b }, p0/z, [x0] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z21.b }, p5/z, [x10, #112] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z23.b }, p3/z, [x13, #-128] # CHECK-NEXT: 1 6 0.33 * ld1rqb { z31.b }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 6 0.33 * U ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: 1 6 0.33 * ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3] # CHECK-NEXT: 1 6 0.33 * ld1rqd { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 6 0.33 * ld1rqd { z23.d }, p3/z, [x13, #-128] # CHECK-NEXT: 1 6 0.33 * ld1rqd { z23.d }, p3/z, [x13, #112] # CHECK-NEXT: 1 6 0.33 * ld1rqd { z31.d }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 6 0.33 * U ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 6 0.33 * ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1] # CHECK-NEXT: 1 6 0.33 * ld1rqh { z0.h }, p0/z, [x0] # CHECK-NEXT: 1 6 0.33 * ld1rqh { z23.h }, p3/z, [x13, #-128] # CHECK-NEXT: 1 6 0.33 * ld1rqh { z23.h }, p3/z, [x13, #112] # CHECK-NEXT: 1 6 0.33 * ld1rqh { z31.h }, p7/z, [sp, #-16] -# CHECK-NEXT: 1 6 0.33 * U ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 6 0.33 * ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #-128] # CHECK-NEXT: 1 6 0.33 * ld1rqw { z23.s }, p3/z, [x13, #112]