Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7043,7 +7043,22 @@
     // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
     // to make a vector of the same size as this SHUFFLE. We can ignore the
     // extract entirely, and canonicalise the concat using WidenVector.
-    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    unsigned VTEltBitWidth = VT.getScalarSizeInBits();
+    if (V1.getOpcode() == ISD::BITCAST &&
+        V1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        V1.getOperand(0).getScalarValueSizeInBits() % VTEltBitWidth == 0) {
+      // If the extract is bitcast to smaller type, offset the DUPLANE index to
+      // account for that and bitcast the DUPLANE operand.
+      SDValue SrcOp = V1.getOperand(0);
+      unsigned ExtIdx = SrcOp.getConstantOperandVal(1);
+      unsigned Scale = SrcOp.getScalarValueSizeInBits() / VTEltBitWidth;
+      Lane += ExtIdx * Scale;
+      unsigned WideVecNumElts =
+          SrcOp.getOperand(0).getValueType().getVectorNumElements();
+      MVT CastVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
+                                    WideVecNumElts * Scale);
+      V1 = DAG.getBitcast(CastVT, SrcOp.getOperand(0));
+    } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
       Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
       V1 = V1.getOperand(0);
     } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1663,8 +1663,7 @@
 define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq3_f32_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
   %bc = bitcast <1 x double> %extract to <2 x float>
@@ -1676,8 +1675,7 @@
 define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq2_f32_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[2]
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
   %bc = bitcast <1 x double> %extract to <2 x float>
@@ -1689,8 +1687,7 @@
 define <4 x i16> @test_vmul_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq5_i16_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    dup v1.4h, v1.h[1]
+; CHECK-NEXT:    dup v1.4h, v1.h[5]
 ; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>