diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -899,15 +899,15 @@
 defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
 defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
 
-  let AddedComplexity = 1 in {
-    class LD1RQPat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr, Instruction ptrue> :
-          Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
-              (load_instr (ptrue 31), GPR64sp:$Xn, 0)>;
-  }
-  def : LD1RQPat<nxv16i8, v16i8, int_aarch64_sve_dupq_lane, LD1RQ_B_IMM, PTRUE_B>;
-  def : LD1RQPat<nxv8i16, v8i16, int_aarch64_sve_dupq_lane, LD1RQ_H_IMM, PTRUE_H>;
-  def : LD1RQPat<nxv4i32, v4i32, int_aarch64_sve_dupq_lane, LD1RQ_W_IMM, PTRUE_S>;
-  def : LD1RQPat<nxv2i64, v2i64, int_aarch64_sve_dupq_lane, LD1RQ_D_IMM, PTRUE_D>;
+  defm : sve_ld1rq_imm_pat<nxv16i8, v16i8, int_aarch64_sve_dupq_lane, LD1RQ_B_IMM, PTRUE_B>;
+  defm : sve_ld1rq_imm_pat<nxv8i16, v8i16, int_aarch64_sve_dupq_lane, LD1RQ_H_IMM, PTRUE_H>;
+  defm : sve_ld1rq_imm_pat<nxv4i32, v4i32, int_aarch64_sve_dupq_lane, LD1RQ_W_IMM, PTRUE_S>;
+  defm : sve_ld1rq_imm_pat<nxv2i64, v2i64, int_aarch64_sve_dupq_lane, LD1RQ_D_IMM, PTRUE_D>;
+
+  def : sve_ld1rq_scalar_pat<nxv16i8, v16i8, int_aarch64_sve_dupq_lane, LD1RQ_B, PTRUE_B>;
+  def : sve_ld1rq_scalar_pat_shift<nxv8i16, v8i16, int_aarch64_sve_dupq_lane, LD1RQ_H, PTRUE_H, 1>;
+  def : sve_ld1rq_scalar_pat_shift<nxv4i32, v4i32, int_aarch64_sve_dupq_lane, LD1RQ_W, PTRUE_S, 2>;
+  def : sve_ld1rq_scalar_pat_shift<nxv2i64, v2i64, int_aarch64_sve_dupq_lane, LD1RQ_D, PTRUE_D, 3>;
 
   // continuous load with reg+reg addressing.
   defm LD1B    : sve_mem_cld_ss<0b0000, "ld1b",  Z_b, ZPR8,  GPR64NoXZRshifted8>;
@@ -2241,6 +2241,15 @@
   def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
             (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
 
+  def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, GPR64:$idx))),
+            (LD1RQ_B $gp, $base, $idx)>;
+  def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (shl GPR64:$idx, (i64 1))))),
+            (LD1RQ_H $gp, $base, $idx)>;
+  def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (shl GPR64:$idx, (i64 2))))),
+            (LD1RQ_W $gp, $base, $idx)>;
+  def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (shl GPR64:$idx, (i64 3))))),
+            (LD1RQ_D $gp, $base, $idx)>;
+
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8),  (SXTB_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -8596,3 +8596,17 @@
 
   def : SVE_2_Op_Pred_All_Active_Pt(NAME # _UNDEF_D)>;
 }
+multiclass sve_ld1rq_imm_pat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr_imm, Instruction ptrue> {
+  def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
+            (load_instr_imm (ptrue 31), GPR64sp:$Xn, 0)>;
+  def : Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (add GPR64sp:$Xn, simm4s16:$imm))), (i64 0))), (i64 0))),
+            (load_instr_imm (ptrue 31), GPR64sp:$Xn, simm4s16:$imm)>;
+}
+
+class sve_ld1rq_scalar_pat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr_scalar, Instruction ptrue> :
+  Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (add GPR64sp:$Xn, GPR64:$idx))), (i64 0))), (i64 0))),
+      (load_instr_scalar (ptrue 31), GPR64sp:$Xn, $idx)>;
+
+class sve_ld1rq_scalar_pat_shift<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr_scalar, Instruction ptrue, int shift> :
+  Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load (add GPR64sp:$Xn, (shl GPR64:$idx, (i64 shift))))), (i64 0))), (i64 0))),
+      (load_instr_scalar (ptrue 31), GPR64sp:$Xn, $idx)>;
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -24,6 +24,16 @@
   ret <vscale x 16 x i8> %res
 }
 
+define <vscale x 16 x i8> @ld1rqb_i8_scalar(<vscale x 16 x i1> %pred, i8* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqb_i8_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+  ret <vscale x 16 x i8> %res
+}
+
 define <vscale x 16 x i8> @ld1rqb_i8_imm_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld1rqb_i8_imm_lower_bound:
 ; CHECK:       // %bb.0:
@@ -47,8 +57,8 @@
 define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld1rqb_i8_imm_out_of_lower_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #129
-; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    mov x8, #-129
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ret
   %ptr = getelementptr inbounds i8, i8* %addr, i64 -129
   %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
   ret <vscale x 16 x i8> %res
 }
@@ -58,14 +68,41 @@
 define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_upper_bound(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld1rqb_i8_imm_out_of_upper_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #113
-; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    mov w8, #113
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ret
   %ptr = getelementptr inbounds i8, i8* %addr, i64 113
   %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
   ret <vscale x 16 x i8> %res
 }
 
+define <vscale x 16 x i8> @ld1rqb_i8_imm_dupqlane(<vscale x 16 x i1> %pred, <16 x i8>* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, #-16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <16 x i8>, <16 x i8>* %addr, i16 -1
+  %load = load <16 x i8>, <16 x i8>* %ptr
+  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
+  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
+  ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_scalar_dupqlane(<vscale x 16 x i1> %pred, i8* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqb_i8_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %idx
+  %ptr_bitcast = bitcast i8* %ptr to <16 x i8>*
+  %load = load <16 x i8>, <16 x i8>* %ptr_bitcast
+  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
+  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
+  ret <vscale x 16 x i8> %2
+}
+
 ;
 ; LD1RQH
 ;
@@ -108,6 +145,26 @@
   ret <vscale x 8 x half> %res
 }
 
+define <vscale x 8 x i16> @ld1rqh_i16_scalar(<vscale x 8 x i1> %pred, i16* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_i16_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %idx
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %ptr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_scalar(<vscale x 8 x i1> %pred, half* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_f16_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %idx
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %ptr)
+  ret <vscale x 8 x half> %res
+}
+
 define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
 ; CHECK-LABEL: ld1rqh_bf16:
 ; CHECK:       // %bb.0:
@@ -127,6 +184,97 @@
   ret <vscale x 8 x bfloat> %res
 }
 
+define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar(<vscale x 8 x i1> %pred, bfloat* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_bf16_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %idx
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %ptr)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x i16> @ld1rqh_i16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x i16>* %addr) {
+; CHECK-LABEL: ld1rqh_i16_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <8 x i16>, <8 x i16>* %addr, i16 -1
+  %load = load <8 x i16>, <8 x i16>* %ptr
+  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
+  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x i16> @ld1rqh_i16_scalar_dupqlane(<vscale x 8 x i1> %pred, i16* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_i16_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %idx
+  %ptr_bitcast = bitcast i16* %ptr to <8 x i16>*
+  %load = load <8 x i16>, <8 x i16>* %ptr_bitcast
+  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
+  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x half>* %addr) {
+; CHECK-LABEL: ld1rqh_f16_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <8 x half>, <8 x half>* %addr, i16 -1
+  %load = load <8 x half>, <8 x half>* %ptr
+  %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
+  %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_scalar_dupqlane(<vscale x 8 x i1> %pred, half* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_f16_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %idx
+  %ptr_bitcast = bitcast half* %ptr to <8 x half>*
+  %load = load <8 x half>, <8 x half>* %ptr_bitcast
+  %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
+  %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x bfloat> @ld1rqh_bf16_imm_dupqlane(<vscale x 8 x i1> %pred, <8 x bfloat>* %addr) {
+; CHECK-LABEL: ld1rqh_bf16_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <8 x bfloat>, <8 x bfloat>* %addr, i16 -1
+  %load = load <8 x bfloat>, <8 x bfloat>* %ptr
+  %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
+  %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
+  ret <vscale x 8 x bfloat> %2
+}
+
+define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar_dupqlane(<vscale x 8 x i1> %pred, bfloat* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqh_bf16_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %idx
+  %ptr_bitcast = bitcast bfloat* %ptr to <8 x bfloat>*
+  %load = load <8 x bfloat>, <8 x bfloat>* %ptr_bitcast
+  %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
+  %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
+  ret <vscale x 8 x bfloat> %2
+}
+
 ;
 ; LD1RQW
 ;
@@ -169,6 +317,80 @@
   ret <vscale x 4 x float> %res
 }
 
+define <vscale x 4 x i32> @ld1rqw_i32_scalar(<vscale x 4 x i1> %pred, i32* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqw_i32_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i32, i32* %base, i64 %idx
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %ptr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_scalar(<vscale x 4 x i1> %pred, float* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqw_f32_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds float, float* %base, i64 %idx
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %ptr)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 4 x i32> @ld1rqw_i32_imm_dupqlane(<vscale x 4 x i1> %pred, <4 x i32>* %addr) {
+; CHECK-LABEL: ld1rqw_i32_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, #16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %addr, i32 1
+  %load = load <4 x i32>, <4 x i32>* %ptr
+  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
+  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @ld1rqw_i32_scalar_dupqlane(<vscale x 4 x i1> %pred, i32* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqw_i32_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i64 %idx
+  %ptr_bitcast = bitcast i32* %ptr to <4 x i32>*
+  %load = load <4 x i32>, <4 x i32>* %ptr_bitcast
+  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
+  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_imm_dupqlane(<vscale x 4 x i1> %pred, <4 x float>* %addr) {
+; CHECK-LABEL: ld1rqw_f32_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, #16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <4 x float>, <4 x float>* %addr, i32 1
+  %load = load <4 x float>, <4 x float>* %ptr
+  %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
+  %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_scalar_dupqlane(<vscale x 4 x i1> %pred, float* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqw_f32_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds float, float* %addr, i64 %idx
+  %ptr_bitcast = bitcast float* %ptr to <4 x float>*
+  %load = load <4 x float>, <4 x float>* %ptr_bitcast
+  %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
+  %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
+  ret <vscale x 4 x float> %2
+}
+
 ;
 ; LD1RQD
 ;
@@ -211,6 +433,80 @@
   ret <vscale x 2 x double> %res
 }
 
+define <vscale x 2 x i64> @ld1rqd_i64_scalar(<vscale x 2 x i1> %pred, i64* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqd_i64_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i64, i64* %base, i64 %idx
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %ptr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_scalar(<vscale x 2 x i1> %pred, double* %base, i64 %idx) {
+; CHECK-LABEL: ld1rqd_f64_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds double, double* %base, i64 %idx
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %ptr)
+  ret <vscale x 2 x double> %res
+}
+
+define <vscale x 2 x i64> @ld1rqd_i64_imm_dupqlane(<vscale x 2 x i1> %pred, <2 x i64>* %addr) {
+; CHECK-LABEL: ld1rqd_i64_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, #16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <2 x i64>, <2 x i64>* %addr, i64 1
+  %load = load <2 x i64>, <2 x i64>* %ptr
+  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @ld1rqd_i64_scalar_dupqlane(<vscale x 2 x i1> %pred, i64* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqd_i64_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i64, i64* %addr, i64 %idx
+  %ptr_bitcast = bitcast i64* %ptr to <2 x i64>*
+  %load = load <2 x i64>, <2 x i64>* %ptr_bitcast
+  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_imm_dupqlane(<vscale x 2 x i1> %pred, <2 x double>* %addr) {
+; CHECK-LABEL: ld1rqd_f64_imm_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, #16]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds <2 x double>, <2 x double>* %addr, i64 1
+  %load = load <2 x double>, <2 x double>* %ptr
+  %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
+  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_scalar_dupqlane(<vscale x 2 x i1> %pred, double* %addr, i64 %idx) {
+; CHECK-LABEL: ld1rqd_f64_scalar_dupqlane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds double, double* %addr, i64 %idx
+  %ptr_bitcast = bitcast double* %ptr to <2 x double>*
+  %load = load <2 x double>, <2 x double>* %ptr_bitcast
+  %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
+  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
+  ret <vscale x 2 x double> %2
+}
+
 ;
 ; LDNT1B
 ;
@@ -616,3 +912,21 @@
 declare <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
 declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
 declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
+declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
@@ -5504,22 +5504,22 @@
 # CHECK-NEXT:  1      6     0.33    *      U     ld1rh { z31.d }, p7/z, [sp, #126]
 # CHECK-NEXT:  1      6     0.33    *      U     ld1rh { z31.h }, p7/z, [sp, #126]
 # CHECK-NEXT:  1      6     0.33    *      U     ld1rh { z31.s }, p7/z, [sp, #126]
-# CHECK-NEXT:  1      6     0.33    *      U     ld1rqb { z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      6     0.33    *            ld1rqb { z0.b }, p0/z, [x0, x0]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqb { z0.b }, p0/z, [x0]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqb { z21.b }, p5/z, [x10, #112]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqb { z23.b }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqb { z31.b }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      6     0.33    *      U     ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      6     0.33    *            ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqd { z0.d }, p0/z, [x0]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqd { z23.d }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqd { z23.d }, p3/z, [x13, #112]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqd { z31.d }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      6     0.33    *      U     ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      6     0.33    *            ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqh { z0.h }, p0/z, [x0]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqh { z23.h }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqh { z23.h }, p3/z, [x13, #112]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqh { z31.h }, p7/z, [sp, #-16]
-# CHECK-NEXT:  1      6     0.33    *      U     ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      6     0.33    *            ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqw { z0.s }, p0/z, [x0]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqw { z23.s }, p3/z, [x13, #-128]
 # CHECK-NEXT:  1      6     0.33    *            ld1rqw { z23.s }, p3/z, [x13, #112]