Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -191,6 +191,8 @@
     setOperationAction(ISD::SHL, VT, Custom);
     setOperationAction(ISD::SRA, VT, Custom);
     setOperationAction(ISD::SRL, VT, Custom);
+    setOperationAction(ISD::AND, VT, Custom);
+    setOperationAction(ISD::OR, VT, Custom);
   }
 
   // Neon does not support vector divide/remainder operations.
@@ -254,6 +256,8 @@
     setOperationAction(ISD::SHL, VT, Custom);
     setOperationAction(ISD::SRA, VT, Custom);
     setOperationAction(ISD::SRL, VT, Custom);
+    setOperationAction(ISD::AND, VT, Custom);
+    setOperationAction(ISD::OR, VT, Custom);
     setOperationAction(ISD::SMIN, VT, Legal);
     setOperationAction(ISD::SMAX, VT, Legal);
     setOperationAction(ISD::UMIN, VT, Legal);
@@ -4966,6 +4970,216 @@
   return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
 }
 
+/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON or MVE instruction with a "modified
+/// immediate" operand (e.g., VMOV). If so, return the encoded value.
+static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                 unsigned SplatBitSize, SelectionDAG &DAG,
+                                 const SDLoc &dl, EVT &VT, EVT VectorVT,
+                                 VMOVModImmType type) {
+  unsigned OpCmode, Imm;
+  bool is128Bits = VectorVT.is128BitVector();
+
+  // SplatBitSize is set to the smallest size that splats the vector, so a
+  // zero vector will always have SplatBitSize == 8. However, NEON modified
+  // immediate instructions other than VMOV do not support the 8-bit encoding
+  // of a zero vector, and the default encoding of zero is supposed to be the
+  // 32-bit version.
+  if (SplatBits == 0)
+    SplatBitSize = 32;
+
+  switch (SplatBitSize) {
+  case 8:
+    if (type != VMOVModImm)
+      return SDValue();
+    // Any 1-byte value is OK. Op=0, Cmode=1110.
+    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+    OpCmode = 0xe;
+    Imm = SplatBits;
+    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+    break;
+
+  case 16:
+    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn: Op=x, Cmode=100x.
+      OpCmode = 0x8;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00: Op=x, Cmode=101x.
+      OpCmode = 0xa;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    return SDValue();
+
+  case 32:
+    // NEON's 32-bit VMOV supports splat values where:
+    // * only one byte is nonzero, or
+    // * the least significant byte is 0xff and the second byte is nonzero, or
+    // * the least significant 2 bytes are 0xff and the third is nonzero.
+    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x000000nn: Op=x, Cmode=000x.
+      OpCmode = 0;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0x0000nn00: Op=x, Cmode=001x.
+      OpCmode = 0x2;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    if ((SplatBits & ~0xff0000) == 0) {
+      // Value = 0x00nn0000: Op=x, Cmode=010x.
+      OpCmode = 0x4;
+      Imm = SplatBits >> 16;
+      break;
+    }
+    if ((SplatBits & ~0xff000000) == 0) {
+      // Value = 0xnn000000: Op=x, Cmode=011x.
+      OpCmode = 0x6;
+      Imm = SplatBits >> 24;
+      break;
+    }
+
+    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
+    if (type == OtherModImm) return SDValue();
+
+    if ((SplatBits & ~0xffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+      // Value = 0x0000nnff: Op=x, Cmode=1100.
+      OpCmode = 0xc;
+      Imm = SplatBits >> 8;
+      break;
+    }
+
+    // cmode == 0b1101 is not supported for MVE VMVN
+    if (type == MVEVMVNModImm)
+      return SDValue();
+
+    if ((SplatBits & ~0xffffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+      // Value = 0x00nnffff: Op=x, Cmode=1101.
+      OpCmode = 0xd;
+      Imm = SplatBits >> 16;
+      break;
+    }
+
+    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
+    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
+    // VMOV.I32. A (very) minor optimization would be to replicate the value
+    // and fall through here to test for a valid 64-bit splat. But, then the
+    // caller would also need to check and handle the change in size.
+    return SDValue();
+
+  case 64: {
+    if (type != VMOVModImm)
+      return SDValue();
+    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
+    uint64_t BitMask = 0xff;
+    unsigned ImmMask = 1;
+    Imm = 0;
+    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
+        Imm |= ImmMask;
+      } else if ((SplatBits & BitMask) != 0) {
+        return SDValue();
+      }
+      BitMask <<= 8;
+      ImmMask <<= 1;
+    }
+
+    if (DAG.getDataLayout().isBigEndian()) {
+      // Reverse the order of elements within the vector.
+      unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
+      unsigned Mask = (1 << BytesPerElem) - 1;
+      unsigned NumElems = 8 / BytesPerElem;
+      unsigned NewImm = 0;
+      for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
+        unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
+        NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
+      }
+      Imm = NewImm;
+    }
+
+    // Op=1, Cmode=1110.
+    OpCmode = 0x1e;
+    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
+    break;
+  }
+
+  default:
+    llvm_unreachable("unexpected size for isVMOVModifiedImm");
+  }
+
+  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
+  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
+}
+
+// Custom lower AND(X, C) -> VBICIMM
+static SDValue LowerAND(SDValue Op, SelectionDAG &DAG,
+                        const ARMSubtarget *Subtarget) {
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+        SplatBitSize == 64) {
+      EVT VBICVT;
+      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, DL, VBICVT, VT, OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input = DAG.getNode(ISD::BITCAST, DL, VBICVT, Op.getOperand(0));
+        SDValue VBIC = DAG.getNode(ARMISD::VBICIMM, DL, VBICVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, DL, VT, VBIC);
+      }
+    }
+  }
+
+  return Op;
+}
+
+// Custom lower OR(X, C) -> VORRIMM
+static SDValue LowerOR(SDValue Op, SelectionDAG &DAG,
+                       const ARMSubtarget *Subtarget) {
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+        SplatBitSize == 64) {
+      EVT VorrVT;
+      SDValue Val =
+          isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+                            SplatBitSize, DAG, DL, VorrVT, VT, OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+            DAG.getNode(ISD::BITCAST, DL, VorrVT, Op.getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, DL, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, DL, VT, Vorr);
+      }
+    }
+  }
+
+  return Op;
+}
+
 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Cond = Op.getOperand(0);
   SDValue SelectTrue = Op.getOperand(1);
@@ -6745,157 +6959,6 @@
                       CCR, Chain.getValue(1));
 }
 
-/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
-/// valid vector constant for a NEON or MVE instruction with a "modified
-/// immediate" operand (e.g., VMOV). If so, return the encoded value.
-static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
-                                 unsigned SplatBitSize, SelectionDAG &DAG,
-                                 const SDLoc &dl, EVT &VT, EVT VectorVT,
-                                 VMOVModImmType type) {
-  unsigned OpCmode, Imm;
-  bool is128Bits = VectorVT.is128BitVector();
-
-  // SplatBitSize is set to the smallest size that splats the vector, so a
-  // zero vector will always have SplatBitSize == 8. However, NEON modified
-  // immediate instructions others than VMOV do not support the 8-bit encoding
-  // of a zero vector, and the default encoding of zero is supposed to be the
-  // 32-bit version.
-  if (SplatBits == 0)
-    SplatBitSize = 32;
-
-  switch (SplatBitSize) {
-  case 8:
-    if (type != VMOVModImm)
-      return SDValue();
-    // Any 1-byte value is OK. Op=0, Cmode=1110.
-    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
-    OpCmode = 0xe;
-    Imm = SplatBits;
-    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
-    break;
-
-  case 16:
-    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
-    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
-    if ((SplatBits & ~0xff) == 0) {
-      // Value = 0x00nn: Op=x, Cmode=100x.
-      OpCmode = 0x8;
-      Imm = SplatBits;
-      break;
-    }
-    if ((SplatBits & ~0xff00) == 0) {
-      // Value = 0xnn00: Op=x, Cmode=101x.
-      OpCmode = 0xa;
-      Imm = SplatBits >> 8;
-      break;
-    }
-    return SDValue();
-
-  case 32:
-    // NEON's 32-bit VMOV supports splat values where:
-    // * only one byte is nonzero, or
-    // * the least significant byte is 0xff and the second byte is nonzero, or
-    // * the least significant 2 bytes are 0xff and the third is nonzero.
-    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
-    if ((SplatBits & ~0xff) == 0) {
-      // Value = 0x000000nn: Op=x, Cmode=000x.
-      OpCmode = 0;
-      Imm = SplatBits;
-      break;
-    }
-    if ((SplatBits & ~0xff00) == 0) {
-      // Value = 0x0000nn00: Op=x, Cmode=001x.
-      OpCmode = 0x2;
-      Imm = SplatBits >> 8;
-      break;
-    }
-    if ((SplatBits & ~0xff0000) == 0) {
-      // Value = 0x00nn0000: Op=x, Cmode=010x.
-      OpCmode = 0x4;
-      Imm = SplatBits >> 16;
-      break;
-    }
-    if ((SplatBits & ~0xff000000) == 0) {
-      // Value = 0xnn000000: Op=x, Cmode=011x.
-      OpCmode = 0x6;
-      Imm = SplatBits >> 24;
-      break;
-    }
-
-    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
-    if (type == OtherModImm) return SDValue();
-
-    if ((SplatBits & ~0xffff) == 0 &&
-        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
-      // Value = 0x0000nnff: Op=x, Cmode=1100.
-      OpCmode = 0xc;
-      Imm = SplatBits >> 8;
-      break;
-    }
-
-    // cmode == 0b1101 is not supported for MVE VMVN
-    if (type == MVEVMVNModImm)
-      return SDValue();
-
-    if ((SplatBits & ~0xffffff) == 0 &&
-        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
-      // Value = 0x00nnffff: Op=x, Cmode=1101.
-      OpCmode = 0xd;
-      Imm = SplatBits >> 16;
-      break;
-    }
-
-    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
-    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
-    // VMOV.I32. A (very) minor optimization would be to replicate the value
-    // and fall through here to test for a valid 64-bit splat. But, then the
-    // caller would also need to check and handle the change in size.
-    return SDValue();
-
-  case 64: {
-    if (type != VMOVModImm)
-      return SDValue();
-    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
-    uint64_t BitMask = 0xff;
-    unsigned ImmMask = 1;
-    Imm = 0;
-    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
-      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
-        Imm |= ImmMask;
-      } else if ((SplatBits & BitMask) != 0) {
-        return SDValue();
-      }
-      BitMask <<= 8;
-      ImmMask <<= 1;
-    }
-
-    if (DAG.getDataLayout().isBigEndian()) {
-      // Reverse the order of elements within the vector.
-      unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
-      unsigned Mask = (1 << BytesPerElem) - 1;
-      unsigned NumElems = 8 / BytesPerElem;
-      unsigned NewImm = 0;
-      for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
-        unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
-        NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
-      }
-      Imm = NewImm;
-    }
-
-    // Op=1, Cmode=1110.
-    OpCmode = 0x1e;
-    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
-    break;
-  }
-
-  default:
-    llvm_unreachable("unexpected size for isVMOVModifiedImm");
-  }
-
-  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
-  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
-}
-
 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                            const ARMSubtarget *ST) const {
   EVT VT = Op.getValueType();
@@ -10119,6 +10182,10 @@
   case ISD::SADDSAT:
   case ISD::SSUBSAT:
     return LowerSADDSUBSAT(Op, DAG, Subtarget);
+  case ISD::AND:
+    return LowerAND(Op, DAG, Subtarget);
+  case ISD::OR:
+    return LowerOR(Op, DAG, Subtarget);
   case ISD::LOAD:
     return LowerPredicateLoad(Op, DAG);
   case ISD::STORE:
@@ -13567,8 +13634,6 @@
 static SDValue PerformANDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
-  // Attempt to use immediate-form VBIC
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
@@ -13577,26 +13642,6 @@
       VT == MVT::v8i1 || VT == MVT::v16i1)
     return SDValue();
 
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
-      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
-    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
-        SplatBitSize == 64) {
-      EVT VbicVT;
-      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
-                                      SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, dl, VbicVT, VT, OtherModImm);
-      if (Val.getNode()) {
-        SDValue Input =
-            DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
-        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
-      }
-    }
-  }
-
   if (!Subtarget->isThumb1Only()) {
     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
@@ -13861,8 +13906,6 @@
 static SDValue PerformORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
-  // Attempt to use immediate-form VORR
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
@@ -13874,26 +13917,6 @@
       (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
     return PerformORCombine_i1(N, DCI, Subtarget);
 
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
-      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
-    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
-        SplatBitSize == 64) {
-      EVT VorrVT;
-      SDValue Val =
-          isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
-                            SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
-      if (Val.getNode()) {
-        SDValue Input =
-            DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
-        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
-      }
-    }
-  }
-
   if (!Subtarget->isThumb1Only()) {
     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1311,7 +1311,23 @@
     { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
     { ISD::SREM, 
MVT::v16i8, 16 * FunctionCallDivCost}, { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, - // Multiplication. + // Bitwise operations are custom but still cheap + { ISD::AND, MVT::v8i8, 1}, + { ISD::OR, MVT::v8i8, 1}, + { ISD::AND, MVT::v4i16, 1}, + { ISD::OR, MVT::v4i16, 1}, + { ISD::AND, MVT::v2i32, 1}, + { ISD::OR, MVT::v2i32, 1}, + { ISD::AND, MVT::v1i64, 1}, + { ISD::OR, MVT::v1i64, 1}, + { ISD::AND, MVT::v16i8, 1}, + { ISD::OR, MVT::v16i8, 1}, + { ISD::AND, MVT::v8i16, 1}, + { ISD::OR, MVT::v8i16, 1}, + { ISD::AND, MVT::v4i32, 1}, + { ISD::OR, MVT::v4i32, 1}, + { ISD::AND, MVT::v2i64, 1}, + { ISD::OR, MVT::v2i64, 1}, }; if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) Index: llvm/test/Analysis/CostModel/ARM/arith-overflow.ll =================================================================== --- llvm/test/Analysis/CostModel/ARM/arith-overflow.ll +++ llvm/test/Analysis/CostModel/ARM/arith-overflow.ll @@ -115,11 +115,11 @@ ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'sadd' @@ -431,11 +431,11 @@ ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> 
undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'ssub' Index: llvm/test/Analysis/CostModel/ARM/arith-ssat.ll =================================================================== --- llvm/test/Analysis/CostModel/ARM/arith-ssat.ll +++ llvm/test/Analysis/CostModel/ARM/arith-ssat.ll @@ -148,14 +148,14 @@ ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for 
instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'add' @@ -354,14 +354,14 @@ ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'sub' Index: llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -3204,11 +3204,8 @@ ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vmullt.u8 q2, q1, q0 ; CHECK-NEXT: vmullb.u8 q0, q1, q0 -; CHECK-NEXT: vqshrnb.u16 q2, q2, #7 ; CHECK-NEXT: vqshrnb.u16 q0, q0, #7 -; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vqshrnt.u16 q0, q2, #7 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB21_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block Index: llvm/test/CodeGen/Thumb2/mve-vqshl.ll 
=================================================================== --- llvm/test/CodeGen/Thumb2/mve-vqshl.ll +++ llvm/test/CodeGen/Thumb2/mve-vqshl.ll @@ -725,13 +725,10 @@ ; CHECK-NEXT: vmovlt.u8 q3, q0 ; CHECK-NEXT: vmovlb.u8 q1, q1 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vshl.u16 q2, q3, q2 ; CHECK-NEXT: vshl.u16 q0, q0, q1 -; CHECK-NEXT: vqmovnb.u16 q2, q2 +; CHECK-NEXT: vshl.u16 q2, q3, q2 ; CHECK-NEXT: vqmovnb.u16 q0, q0 -; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vqmovnt.u16 q0, q2 ; CHECK-NEXT: bx lr entry: %e0 = zext <16 x i8> %s0 to <16 x i16> @@ -1510,11 +1507,8 @@ ; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vshl.u16 q1, r0 ; CHECK-NEXT: vshl.u16 q0, r0 -; CHECK-NEXT: vqmovnb.u16 q1, q1 ; CHECK-NEXT: vqmovnb.u16 q0, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q1 +; CHECK-NEXT: vqmovnt.u16 q0, q1 ; CHECK-NEXT: bx lr entry: %e0 = zext <16 x i8> %s0 to <16 x i16> @@ -2045,13 +2039,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovlt.u8 q1, q0 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vshl.i16 q1, q1, #3 ; CHECK-NEXT: vshl.i16 q0, q0, #3 -; CHECK-NEXT: vqmovnb.u16 q1, q1 +; CHECK-NEXT: vshl.i16 q1, q1, #3 ; CHECK-NEXT: vqmovnb.u16 q0, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q1 +; CHECK-NEXT: vqmovnt.u16 q0, q1 ; CHECK-NEXT: bx lr entry: %e0 = zext <16 x i8> %s0 to <16 x i16>
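Editorial note, not part of the patch: the IR below is a minimal illustrative sketch of the pattern the new LowerAND path targets (the function name and constant are invented for illustration). An AND against a splat whose complement is a valid modified immediate (here ~0xffffff00 == 0xff) is expected to select an immediate-form VBIC, e.g. vbic.i32 q0, #255, on both NEON and MVE instead of materializing the mask in a register; the same logic previously lived in PerformANDCombine and is now applied during custom lowering.

define <4 x i32> @and_inverted_modimm(<4 x i32> %x) {
entry:
  ; -256 == 0xffffff00; its complement 0xff is encodable as a 32-bit modified immediate
  %r = and <4 x i32> %x, <i32 -256, i32 -256, i32 -256, i32 -256>
  ret <4 x i32> %r
}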