Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1282,6 +1282,16 @@
   : VectorIndex<i64, SVEVectorIndexExtDupQOperand,
                 [{ return ((uint64_t)Imm) < 4; }]>;
 
+defm sve_elm_idx_extdup_32b
+  : VectorIndex<i32, SVEVectorIndexExtDupBOperand,
+                [{ return ((uint64_t)Imm) < 64; }]>;
+defm sve_elm_idx_extdup_32h
+  : VectorIndex<i32, SVEVectorIndexExtDupHOperand,
+                [{ return ((uint64_t)Imm) < 32; }]>;
+defm sve_elm_idx_extdup_32s
+  : VectorIndex<i32, SVEVectorIndexExtDupSOperand,
+                [{ return ((uint64_t)Imm) < 16; }]>;
+
 // 8-bit immediate for AdvSIMD where 64-bit values of the form:
 // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
 // are encoded as the eight bit value 'abcdefgh'.
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -607,6 +607,24 @@
               (FDUP_ZI_D fpimm64:$imm8)>;
   }
 
+  // Simplify DUP + TBL to DUPLane
+  def : Pat<(nxv16i8 (AArch64tbl (nxv16i8 ZPR:$vec), (nxv16i8 (AArch64dup sve_elm_idx_extdup_32b:$index)))),
+            (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_32b:$index)>;
+  def : Pat<(nxv8i16 (AArch64tbl (nxv8i16 ZPR:$vec), (nxv8i16 (AArch64dup sve_elm_idx_extdup_32h:$index)))),
+            (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_32h:$index)>;
+  def : Pat<(nxv4i32 (AArch64tbl (nxv4i32 ZPR:$vec), (nxv4i32 (AArch64dup sve_elm_idx_extdup_32s:$index)))),
+            (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_32s:$index)>;
+  def : Pat<(nxv2i64 (AArch64tbl (nxv2i64 ZPR:$vec), (nxv2i64 (AArch64dup sve_elm_idx_extdup_d:$index)))),
+            (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+  def : Pat<(nxv8f16 (AArch64tbl (nxv8f16 ZPR:$vec), (nxv8i16 (AArch64dup sve_elm_idx_extdup_32h:$index)))),
+            (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_32h:$index)>;
+  def : Pat<(nxv8bf16 (AArch64tbl (nxv8bf16 ZPR:$vec), (nxv8i16 (AArch64dup sve_elm_idx_extdup_32h:$index)))),
+            (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_32h:$index)>;
+  def : Pat<(nxv4f32 (AArch64tbl (nxv4f32 ZPR:$vec), (nxv4i32 (AArch64dup sve_elm_idx_extdup_32s:$index)))),
+            (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_32s:$index)>;
+  def : Pat<(nxv2f64 (AArch64tbl (nxv2f64 ZPR:$vec), (nxv2i64 (AArch64dup sve_elm_idx_extdup_d:$index)))),
+            (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+
   // Select elements from either vector (predicated)
   defm SEL_ZPZZ   : sve_int_sel_vvv<"sel", vselect>;
@@ -2279,23 +2297,23 @@
   // Extract element from vector with immediate index
   def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
-            (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
+            (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, (trunc_imm sve_elm_idx_extdup_b:$index)), ssub)>;
   def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
-            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
+            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, (trunc_imm sve_elm_idx_extdup_h:$index)), ssub)>;
   def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
-            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, (trunc_imm sve_elm_idx_extdup_s:$index)), ssub)>;
   def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
   def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
-            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, (trunc_imm sve_elm_idx_extdup_h:$index)), hsub)>;
   def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
-            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>;
+            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, (trunc_imm sve_elm_idx_extdup_s:$index)), hsub)>;
   def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>;
   def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
-            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, (trunc_imm sve_elm_idx_extdup_s:$index)), ssub)>;
   def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), ssub)>;
   def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -979,15 +979,15 @@
 }
 
 multiclass sve_int_perm_dup_i<string asm> {
-  def _B : sve_int_perm_dup_i<{?,?,?,?,1}, sve_elm_idx_extdup_b, asm, ZPR8> {
+  def _B : sve_int_perm_dup_i<{?,?,?,?,1}, sve_elm_idx_extdup_32b, asm, ZPR8> {
     let Inst{23-22} = idx{5-4};
     let Inst{20-17} = idx{3-0};
   }
-  def _H : sve_int_perm_dup_i<{?,?,?,1,0}, sve_elm_idx_extdup_h, asm, ZPR16> {
+  def _H : sve_int_perm_dup_i<{?,?,?,1,0}, sve_elm_idx_extdup_32h, asm, ZPR16> {
    let Inst{23-22} = idx{4-3};
    let Inst{20-18} = idx{2-0};
   }
-  def _S : sve_int_perm_dup_i<{?,?,1,0,0}, sve_elm_idx_extdup_s, asm, ZPR32> {
+  def _S : sve_int_perm_dup_i<{?,?,1,0,0}, sve_elm_idx_extdup_32s, asm, ZPR32> {
     let Inst{23-22} = idx{3-2};
     let Inst{20-19} = idx{1-0};
   }
@@ -1000,11 +1000,11 @@
   }
 
   def : InstAlias<"mov $Zd, $Zn$idx",
-                  (!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, sve_elm_idx_extdup_b:$idx), 1>;
+                  (!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, sve_elm_idx_extdup_32b:$idx), 1>;
   def : InstAlias<"mov $Zd, $Zn$idx",
-                  (!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, sve_elm_idx_extdup_h:$idx), 1>;
+                  (!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, sve_elm_idx_extdup_32h:$idx), 1>;
   def : InstAlias<"mov $Zd, $Zn$idx",
-                  (!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, sve_elm_idx_extdup_s:$idx), 1>;
+                  (!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, sve_elm_idx_extdup_32s:$idx), 1>;
   def : InstAlias<"mov $Zd, $Zn$idx",
                   (!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, sve_elm_idx_extdup_d:$idx), 1>;
   def : InstAlias<"mov $Zd, $Zn$idx",
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
@@ -8,7 +8,7 @@
 ; Unpredicated dup instruction (which is an alias for mov):
 ;   * register + register,
 ;   * register + immediate
-;
+;   * register + vector element index
 
 define <vscale x 16 x i8> @dup_i8(i8 %b) {
 ; CHECK-LABEL: dup_i8:
@@ -154,6 +154,69 @@
   ret <vscale x 2 x double> %out
 }
 
+define <vscale x 16 x i8> @dup_ext_i8(<vscale x 16 x i8> %data) {
+; CHECK-LABEL: dup_ext_i8:
+; CHECK: mov z0.b, z0.b[1]
+; CHECK-NEXT: ret
+  %tmp = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 1)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.tbl.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i8> %tmp)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @dup_ext_i16(<vscale x 8 x i16> %data) {
+; CHECK-LABEL: dup_ext_i16:
+; CHECK: mov z0.h, z0.h[1]
+; CHECK-NEXT: ret
+  %tmp = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.tbl.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i16> %tmp)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @dup_ext_i32(<vscale x 4 x i32> %data) {
+; CHECK-LABEL: dup_ext_i32:
+; CHECK: mov z0.s, z0.s[1]
+; CHECK-NEXT: ret
+  %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.tbl.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32> %tmp)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dup_ext_i64(<vscale x 2 x i64> %data) {
+; CHECK-LABEL: dup_ext_i64:
+; CHECK: mov z0.d, z0.d[1]
+; CHECK-NEXT: ret
+  %tmp = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.tbl.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64> %tmp)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x half> @dup_ext_f16(<vscale x 8 x half> %data) {
+; CHECK-LABEL: dup_ext_f16:
+; CHECK: mov z0.h, z0.h[1]
+; CHECK-NEXT: ret
+  %tmp = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.tbl.nxv8f16(<vscale x 8 x half> %data, <vscale x 8 x i16> %tmp)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @dup_ext_f32(<vscale x 4 x float> %data) {
+; CHECK-LABEL: dup_ext_f32:
+; CHECK: mov z0.s, z0.s[1]
+; CHECK-NEXT: ret
+  %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.tbl.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i32> %tmp)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @dup_ext_f64(<vscale x 2 x double> %data) {
+; CHECK-LABEL: dup_ext_f64:
+; CHECK: mov z0.d, z0.d[1]
+; CHECK-NEXT: ret
+  %tmp = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.tbl.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i64> %tmp)
+  ret <vscale x 2 x double> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
@@ -163,6 +226,14 @@
 declare <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.tbl.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.tbl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.tbl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.tbl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x half> @llvm.aarch64.sve.tbl.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i16>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.tbl.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i16>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.tbl.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tbl.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i64>)
 
 ; +bf16 is required for the bfloat version.
 attributes #0 = { "target-features"="+sve,+bf16" }
Index: llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -29,8 +29,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    add x8, x0, x2, lsl #3
-; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    ret
   %load = load double, double* %a
   %dup = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double %load)