diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9797,6 +9797,8 @@
   MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
   SDValue Mask = MST->getMask();
   SDValue Chain = MST->getChain();
+  SDValue Value = MST->getValue();
+  SDValue Ptr = MST->getBasePtr();
   SDLoc DL(N);
 
   // Zap masked stores with a zero mask.
@@ -9815,6 +9817,19 @@
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
 
+  // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
+  // truncating store. We can do this even if this is already a truncstore.
+  if ((Value.getOpcode() == ISD::FP_ROUND ||
+       Value.getOpcode() == ISD::TRUNCATE) &&
+      Value.getNode()->hasOneUse() && MST->isUnindexed() &&
+      TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
+                               MST->getMemoryVT(), LegalOperations)) {
+    return DAG.getMaskedStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+                              MST->getOffset(), MST->getMask(),
+                              MST->getMemoryVT(), MST->getMemOperand(),
+                              MST->getAddressingMode(), /*IsTruncating=*/true);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18152,16 +18152,19 @@
 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
     SDValue Op, SelectionDAG &DAG) const {
   auto Store = cast<MaskedStoreSDNode>(Op);
-
-  if (Store->isTruncatingStore())
-    return SDValue();
+  SDValue Mask = Store->getMask();
 
   SDLoc DL(Op);
   EVT VT = Store->getValue().getValueType();
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
 
+  if (Store->isTruncatingStore()) {
+    Mask = DAG.getNode(
+        ISD::ZERO_EXTEND, DL,
+        VT.changeVectorElementType(ContainerVT.getVectorElementType()), Mask);
+  }
+
+  Mask = convertFixedMaskToScalableVector(Mask, DAG);
   auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
-  SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
 
   return DAG.getMaskedStore(
       Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -151,14 +151,7 @@
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
-; VBITS_GE_512-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
-; VBITS_GE_512-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
-; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].b, p{{[0-9]+}}/z, [[Z1]].b, #0
-; VBITS_GE_512-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
-; VBITS_GE_512-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
-; VBITS_GE_512-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
-; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, p[[P2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %b = load <8 x i64>, <8 x i64>* %bp
@@ -173,15 +166,8 @@
 ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
-; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.h, vl8
-; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z1]].d, p[[P0]]/z, #-1
-; VBITS_GE_512-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
-; VBITS_GE_512-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
-; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].h, p{{[0-9]+}}/z, [[Z1]].h, #0
-; VBITS_GE_512-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
-; VBITS_GE_512-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, p[[P2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
+; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %b = load <8 x i64>, <8 x i64>* %bp
@@ -196,13 +182,8 @@
 ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
-; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.s, vl8
-; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z1]].d, p[[P0]]/z, #-1
-; VBITS_GE_512-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
-; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].s, p{{[0-9]+}}/z, [[Z1]].s, #0
-; VBITS_GE_512-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, p[[P2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
+; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %b = load <8 x i64>, <8 x i64>* %bp
@@ -217,15 +198,8 @@
 ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
-; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.b, vl16
-; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z1]].s, p[[P0]]/z, #-1
-; VBITS_GE_512-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
-; VBITS_GE_512-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
-; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].b, p{{[0-9]+}}/z, [[Z1]].b, #0
-; VBITS_GE_512-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
-; VBITS_GE_512-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
-; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, p[[P2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s
+; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x i32>, <16 x i32>* %ap
   %b = load <16 x i32>, <16 x i32>* %bp
@@ -240,13 +214,8 @@
 ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
-; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.h, vl16
-; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z1]].s, p[[P0]]/z, #-1
-; VBITS_GE_512-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
-; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].h, p{{[0-9]+}}/z, [[Z1]].h, #0
-; VBITS_GE_512-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, p[[P2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s
+; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x i32>, <16 x i32>* %ap
   %b = load <16 x i32>, <16 x i32>* %bp
@@ -261,13 +230,8 @@
 ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
-; VBITS_GE_512-DAG: ptrue p{{[0-9]+}}.b, vl32
-; VBITS_GE_512-DAG: cmpeq p[[P1:[0-9]+]].h, p[[P0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z1]].h, p[[P0]]/z, #-1
-; VBITS_GE_512-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
-; VBITS_GE_512-DAG: cmpne p[[P2:[0-9]+]].b, p{{[0-9]+}}/z, [[Z1]].b, #0
-; VBITS_GE_512-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
-; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, p[[P2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].h, p[[P0]]/z, [[Z0]].h, [[Z1]].h
+; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <32 x i16>, <32 x i16>* %ap
   %b = load <32 x i16>, <32 x i16>* %bp
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -1459,6 +1459,7 @@
 ; CHECK-NEXT:    vldrht.s32 q3, [r1], #8
 ; CHECK-NEXT:    vmul.i32 q2, q3, q2
 ; CHECK-NEXT:    vqshrnb.s32 q2, q2, #15
+; CHECK-NEXT:    vmovlb.s16 q2, q2
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrht.32 q2, [r2], #8
 ; CHECK-NEXT:    le lr, .LBB8_2
@@ -2590,6 +2591,7 @@
 ; CHECK-NEXT:    vldrbt.s16 q6, [r1], #8
 ; CHECK-NEXT:    vmul.i16 q5, q6, q5
 ; CHECK-NEXT:    vqshrnb.s16 q5, q5, #7
+; CHECK-NEXT:    vmovlb.s8 q5, q5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrbt.16 q5, [r2], #8
 ; CHECK-NEXT:    le lr, .LBB17_2
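Illustrative note: the IR below is a minimal sketch of the pattern the new combine targets, i.e. a trunc whose only use is the value operand of an llvm.masked.store with a narrower element type. The function and value names are hypothetical and are not reproduced from the test file; only the llvm.masked.store intrinsic signature and the "+sve" target feature are standard. Assuming a fixed SVE register size of at least 512 bits, the DAGCombiner fold plus the AArch64 lowering change above let this select a single predicated truncating store (st1b { z.d }) in place of the uzp1/cmpne sequence deleted from the CHECK lines.

define void @masked_trunc_store_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 {
  %a = load <8 x i64>, <8 x i64>* %ap
  %b = load <8 x i64>, <8 x i64>* %bp
  ; The compare produces the <8 x i1> predicate for the store.
  %mask = icmp eq <8 x i64> %a, %b
  ; The truncate has a single use: the masked store below.
  %val = trunc <8 x i64> %a to <8 x i8>
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %val, <8 x i8>* %dest, i32 8, <8 x i1> %mask)
  ret void
}

declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)

attributes #0 = { "target-features"="+sve" }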