diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10572,6 +10572,24 @@
     return true;
   }
+  case Instruction::Mul: {
+    for (auto OpIdx : enumerate(I->operands())) {
+      Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
+      // Make sure we are not already sinking this operand.
+      if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+        continue;
+
+      Instruction *Shuffle = Op;
+
+      // The splat source is not always an instruction (it can be a constant
+      // or a function argument), so only queue it for sinking when it is one.
+      if (isa<Instruction>(Shuffle->getOperandUse(0)))
+        Ops.push_back(&Shuffle->getOperandUse(0));
+      Ops.push_back(&OpIdx.value());
+    }
+    return true;
+  }
   default:
     return false;
   }
@@ -14966,6 +14984,50 @@
   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
 }
+static SDValue performDUPSextCombine(SDNode *N, SelectionDAG &DAG,
+                                     bool Signed) {
+  SDValue Operand = N->getOperand(0);
+  SDValue ExtOperand = Operand.getOperand(0);
+
+  EVT NType = N->getValueType(0);
+
+  // Cannot operate on non-vector dups.
+  if (!NType.isVector())
+    return SDValue();
+
+  EVT VectorElementType = NType.getVectorElementType();
+
+  // Cannot operate on non-integer elements.
+  if (!VectorElementType.isInteger())
+    return SDValue();
+
+  unsigned Size = VectorElementType.getSizeInBits();
+
+  // Cannot operate on an element size of 8 bits: there is no narrower
+  // element type to dup at.
+  if (Size == 8)
+    return SDValue();
+
+  EVT HalfType = MVT::getIntegerVT(Size / 2);
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfType,
+                                NType.getVectorNumElements());
+
+  SDValue DupNode = DAG.getNode(N->getOpcode(), SDLoc(N), HalfVT, ExtOperand);
+  SDValue ExtNode =
+      DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                  SDLoc(Operand.getNode()), NType, DupNode);
+
+  return ExtNode; // sext/zext(dup(...))
+}
+
+static SDValue performDUPCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 SelectionDAG &DAG) {
+  SDValue Operand = N->getOperand(0);
+  unsigned Opcode = Operand.getOpcode();
+  bool IsSext = Opcode == ISD::SIGN_EXTEND ||
+                Opcode == ISD::SIGN_EXTEND_INREG || Opcode == ISD::AssertSext;
+  if (IsSext || Opcode == ISD::ZERO_EXTEND || Opcode == ISD::AssertZext)
+    return performDUPSextCombine(N, DAG, IsSext);
+
+  return performPostLD1Combine(N, DCI, false);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -15026,7 +15088,7 @@
   case AArch64ISD::CSEL:
     return performCONDCombine(N, DCI, DAG, 2, 3);
   case AArch64ISD::DUP:
-    return performPostLD1Combine(N, DCI, false);
+    return performDUPCombine(N, DCI, DAG);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::UZP1:
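Editorial note, not part of the patch: the two changes above cooperate. The
shouldSinkOperands case sinks a widening mul's splat operands into the user
block so the dup stays next to the multiply, and the DUP combine rewrites
dup-of-extend as extend-of-a-narrower-dup so instruction selection can match
smull/umull. A reduced, hypothetical IR reproducer of the targeted pattern
(function and value names are illustrative):

define <4 x i32> @mul_by_splat(<4 x i16>* %p, i16 %s) {
  ; Splat of a sign-extended scalar...
  %conv = sext i16 %s to i32
  %ins = insertelement <4 x i32> undef, i32 %conv, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ; ...multiplied by a sign-extended vector load.
  %v = load <4 x i16>, <4 x i16>* %p, align 2
  %ext = sext <4 x i16> %v to <4 x i32>
  %mul = mul nsw <4 x i32> %splat, %ext
  ret <4 x i32> %mul
}

With the combine, the dup is expected to be formed at <4 x i16> and
sign-extended afterwards, which ISel can fold with the mul into
smull v.4s, v.4h, v.4h, as the tests below check.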
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-smull.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-smull.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s
+
+define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
+; CHECK-LABEL: matrix_mul_signed:
+; CHECK:       // %bb.0: // %vector.header
+; CHECK-NEXT:    sxth w9, w3
+; CHECK-NEXT:    //
+; CHECK-NEXT:    and x8, x0, #0xfffffff8
+; CHECK-NEXT:    dup v0.4h, w9
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    //
+; CHECK-NEXT:    add x9, x2, w0, sxtw #1
+; CHECK-NEXT:    ldp d1, d2, [x9]
+; CHECK-NEXT:    add x9, x1, w0, sxtw #2
+; CHECK-NEXT:    subs x8, x8, #8
+; CHECK-NEXT:    add w0, w0, #8
+; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
+; CHECK-NEXT:    stp q1, q2, [x9]
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.end12
+; CHECK-NEXT:    ret
+vector.header:
+  %conv4 = sext i16 %val to i32
+  %wide.trip.count = sext i32 %N to i64
+  %0 = add nsw i64 %wide.trip.count, -1
+  %min.iters.check = icmp ult i32 %N, 8
+  %1 = trunc i64 %0 to i32
+  %2 = icmp ugt i64 %0, 4294967295
+  %n.vec = and i64 %wide.trip.count, 4294967288
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
+  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.header, %vector.body
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
+  %3 = trunc i64 %index to i32
+  %4 = add i32 %N, %3
+  %5 = sext i32 %4 to i64
+  %6 = getelementptr inbounds i16, i16* %A, i64 %5
+  %7 = bitcast i16* %6 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
+  %8 = getelementptr inbounds i16, i16* %6, i64 4
+  %9 = bitcast i16* %8 to <4 x i16>*
+  %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
+  %10 = sext <4 x i16> %wide.load to <4 x i32>
+  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
+  %12 = mul nsw <4 x i32> %broadcast.splat, %10
+  %13 = mul nsw <4 x i32> %broadcast.splat32, %11
+  %14 = getelementptr inbounds i32, i32* %C, i64 %5
+  %15 = bitcast i32* %14 to <4 x i32>*
+  store <4 x i32> %12, <4 x i32>* %15, align 4
+  %16 = getelementptr inbounds i32, i32* %14, i64 4
+  %17 = bitcast i32* %16 to <4 x i32>*
+  store <4 x i32> %13, <4 x i32>* %17, align 4
+  %index.next = add i64 %index, 8
+  %18 = icmp eq i64 %index.next, %n.vec
+  br i1 %18, label %for.end12, label %vector.body
+
+for.end12:                                        ; preds = %vector.body
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s
+
+define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
+; CHECK-LABEL: matrix_mul_unsigned:
+; CHECK:       // %bb.0: // %vector.header
+; CHECK-NEXT:    and w9, w3, #0xffff
+; CHECK-NEXT:    //
+; CHECK-NEXT:    and x8, x0, #0xfffffff8
+; CHECK-NEXT:    dup v0.4h, w9
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    //
+; CHECK-NEXT:    add x9, x2, w0, uxtw #1
+; CHECK-NEXT:    ldp d1, d2, [x9]
+; CHECK-NEXT:    add x9, x1, w0, uxtw #2
+; CHECK-NEXT:    subs x8, x8, #8
+; CHECK-NEXT:    add w0, w0, #8
+; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
+; CHECK-NEXT:    stp q1, q2, [x9]
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.end12
+; CHECK-NEXT:    ret
+vector.header:
+  %conv4 = zext i16 %val to i32
+  %wide.trip.count = zext i32 %N to i64
+  %0 = add nsw i64 %wide.trip.count, -1
+  %min.iters.check = icmp ult i32 %N, 8
+  %1 = trunc i64 %0 to i32
+  %2 = icmp ugt i64 %0, 4294967295
+  %n.vec = and i64 %wide.trip.count, 4294967288
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
+  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.header, %vector.body
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
+  %3 = trunc i64 %index to i32
+  %4 = add i32 %N, %3
+  %5 = zext i32 %4 to i64
+  %6 = getelementptr inbounds i16, i16* %A, i64 %5
+  %7 = bitcast i16* %6 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
+  %8 = getelementptr inbounds i16, i16* %6, i64 4
+  %9 = bitcast i16* %8 to <4 x i16>*
+  %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
+  %10 = zext <4 x i16> %wide.load to <4 x i32>
+  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
+  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
+  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
+  %14 = getelementptr inbounds i32, i32* %C, i64 %5
+  %15 = bitcast i32* %14 to <4 x i32>*
+  store <4 x i32> %12, <4 x i32>* %15, align 4
+  %16 = getelementptr inbounds i32, i32* %14, i64 4
+  %17 = bitcast i32* %16 to <4 x i32>*
+  store <4 x i32> %13, <4 x i32>* %17, align 4
+  %index.next = add i64 %index, 8
+  %18 = icmp eq i64 %index.next, %n.vec
+  br i1 %18, label %for.end12, label %vector.body
+
+for.end12:                                        ; preds = %vector.body
+  ret void
+}
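Editorial note: the CHECK lines in both new tests were produced by the script
named in their NOTE lines; if a later revision of the patch changes the
generated code, they can be regenerated with:

  llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/AArch64/aarch64-matrix-smull.ll llvm/test/CodeGen/AArch64/aarch64-matrix-umull.ll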