diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23720,10 +23720,6 @@
       continue;
     }
 
-    // Profitability check: only deal with extractions from the first subvector.
-    if (OpSubvecIdx != 0)
-      return SDValue();
-
     const std::pair<SDValue, int> DemandedSubvector =
         std::make_pair(Op, OpSubvecIdx);
 
@@ -23753,6 +23749,14 @@
   if (DemandedSubvectors.empty())
     return DAG.getUNDEF(NarrowVT);
 
+  // Profitability check: only deal with extractions from the first subvector
+  // unless the mask becomes a fully-defined (no undef) identity mask.
+  if (!ShuffleVectorInst::isIdentityMask(NewMask) ||
+      any_of(NewMask, [](int M) { return M < 0; }))
+    for (auto &DemandedSubvector : DemandedSubvectors)
+      if (DemandedSubvector.second != 0)
+        return SDValue();
+
   // We still perform the exact same EXTRACT_SUBVECTOR,  just on different
   // operand[s]/index[es], so there is no point in checking for it's legality.
 
diff --git a/llvm/test/CodeGen/AArch64/add-extract.ll b/llvm/test/CodeGen/AArch64/add-extract.ll
--- a/llvm/test/CodeGen/AArch64/add-extract.ll
+++ b/llvm/test/CodeGen/AArch64/add-extract.ll
@@ -83,9 +83,9 @@
 define i64 @add_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: add_i64_ext_ext_test1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    add d0, d0, d1
-; CHECK-NEXT:    dup v1.2d, v1.d[1]
-; CHECK-NEXT:    add d0, d0, d1
+; CHECK-NEXT:    add d0, d0, d2
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
@@ -99,9 +99,9 @@
 define i64 @sub_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: sub_i64_ext_ext_test1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    sub d0, d0, d1
-; CHECK-NEXT:    dup v1.2d, v1.d[1]
-; CHECK-NEXT:    sub d0, d0, d1
+; CHECK-NEXT:    sub d0, d0, d2
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
diff --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll
--- a/llvm/test/CodeGen/AArch64/shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/shuffles.ll
@@ -262,3 +262,16 @@
   %r = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 2, i32 7, i32 2, i32 0, i32 3, i32 2, i32 15>
   ret <8 x half> %r
 }
+
+define <4 x i32> @extract_shuffle(<8 x i16> %j, <4 x i16> %k) {
+; CHECK-LABEL: extract_shuffle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #3
+; CHECK-NEXT:    ret
+  %a = shufflevector <8 x i16> %j, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %b = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %c = zext <4 x i16> %b to <4 x i32>
+  %d = shl <4 x i32> %c, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %d
+}
+
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -56,36 +56,36 @@
 ; CHECK-NEXT:    mov v1.b[5], w10
 ; CHECK-NEXT:    umov w10, v0.b[14]
 ; CHECK-NEXT:    mov v2.b[5], w8
-; CHECK-NEXT:    mov x8, #16
+; CHECK-NEXT:    mov x8, #16 // =0x10
 ; CHECK-NEXT:    mov v1.b[6], w9
-; CHECK-NEXT:    mov x9, #24
+; CHECK-NEXT:    mov x9, #24 // =0x18
 ; CHECK-NEXT:    ld1w { z4.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    mov v2.b[6], w10
 ; CHECK-NEXT:    umov w10, v0.b[15]
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x0, x9, lsl #2]
-; CHECK-NEXT:    dup v3.2d, v0.d[1]
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    mov v1.b[7], w11
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    mov v2.b[7], w10
+; CHECK-NEXT:    lsl z0.s, z0.s, #31
+; CHECK-NEXT:    asr z0.s, z0.s, #31
+; CHECK-NEXT:    mov x11, #8 // =0x8
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    and z0.s, z0.s, #0x1
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    mov x11, #8
-; CHECK-NEXT:    lsl z0.s, z0.s, #31
+; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    lsl z3.s, z3.s, #31
-; CHECK-NEXT:    asr z0.s, z0.s, #31
-; CHECK-NEXT:    asr z3.s, z3.s, #31
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    asr z0.s, z3.s, #31
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
-; CHECK-NEXT:    and z3.s, z3.s, #0x1
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x11, lsl #2]
-; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, #0
-; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
+; CHECK-NEXT:    cmpne p2.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
 ; CHECK-NEXT:    lsl z2.s, z2.s, #31
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
@@ -96,12 +96,12 @@
 ; CHECK-NEXT:    mov z5.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    cmpne p2.s, p0/z, z2.s, #0
-; CHECK-NEXT:    mov z3.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.s, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    st1w { z4.s }, p0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z5.s }, p0, [x0, x9, lsl #2]
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0, x11, lsl #2]
-; CHECK-NEXT:    st1w { z3.s }, p0, [x0]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x0, x11, lsl #2]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:  .LBB1_2: // %exit
 ; CHECK-NEXT:    ret
   %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer