Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -163,6 +163,12 @@
   return getIntImmCost(Imm, Ty);
 }
 
+/// True if \p Val is a plain load or a call to the llvm.masked.load intrinsic.
+static bool isLoadOrMaskedLoad(const Value *Val) {
+  const auto *I = dyn_cast<IntrinsicInst>(Val);
+  return I ? I->getIntrinsicID() == Intrinsic::masked_load : isa<LoadInst>(Val);
+}
+
 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                  const Instruction *I, CastContextHint CCH) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -189,8 +195,14 @@
   if (!SrcTy.isSimple() || !DstTy.isSimple())
     return BaseT::getCastInstrCost(Opcode, Dst, Src, nullptr, CCH);
 
-  // The extend of a load is free
-  if (I && isa<LoadInst>(I->getOperand(0))) {
+  // Extending/truncating masked loads/stores is expensive when the destination
+  // type is wider than 128 bits because we can't split them. We'll likely end
+  // up loading/storing each element individually (hence the high cost).
+  if (CCH == CastContextHint::MaskedExtOrTrunc && DstTy.getSizeInBits() > 128)
+    return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
+
+  // The extend of other kinds of load is free
+  if (I && isLoadOrMaskedLoad(I->getOperand(0))) {
     static const TypeConversionCostTblEntry LoadConversionTbl[] = {
         {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
         {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
Index: llvm/test/Analysis/CostModel/ARM/cast.ll
===================================================================
--- llvm/test/Analysis/CostModel/ARM/cast.ll
+++ llvm/test/Analysis/CostModel/ARM/cast.ll
@@ -1930,3 +1930,291 @@
   %h = bitcast i16 undef to half
   ret i32 undef
 }
+
+define void @masked_loads_and_stores() {
+; CHECK-NEON-LABEL: 'masked_loads_and_stores'
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-MVE-LABEL: 'masked_loads_and_stores'
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-V8M-MAIN-LABEL: 'masked_loads_and_stores'
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-V8M-BASE-LABEL: 'masked_loads_and_stores'
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-V8R-LABEL: 'masked_loads_and_stores'
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+
+  %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+  %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+  %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+  %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+
+  ; zext/sexts whose result fits in a 128-bit register
+
+  %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
+  %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
+
+  %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
+  %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
+
+  ; zext/sexts whose result doesn't fit in a 128-bit register
+
+  %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
+  %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
+  %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
+  %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
+  %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
+  %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
+
+  %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
+  %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
+  %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
+  %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
+
+  %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+  %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+  %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
+  %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
+
+  %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
+  %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
+
+  %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
+  %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
+
+  ; trunc+stores whose result fits in a 128-bit register
+
+  %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
+  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
+
+  %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
+
+  ; trunc+stores whose result doesn't fit in a 128-bit register
+
+  %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
+  call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+
+  %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
+  call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
+
+  %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
+  call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
+
+  %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
+  call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
+
+  ret void
+}
+
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>)
+declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, <8 x i1>)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
+declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32 immarg, <16 x i1>)
+declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32 immarg, <16 x i1>)
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll
@@ -0,0 +1,121 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=TAILPRED
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+@Input = external dso_local local_unnamed_addr global i8*, align 8
+
+define dso_local i32 @tp_reduces_vf(i8* nocapture %0, i32 %1) local_unnamed_addr #0 {
+  ; TP (tail predication) is requested with -prefer-predicate-over-epilog.
+  ; When TP is disabled, this test should vectorize with a VF of 16.
+  ; When TP is enabled, this test should vectorize with a VF of 8, as a VF of
+  ; 16 would extend masked loads past 128 bits, which ARM costs as expensive.
+  ; DEFAULT: load <16 x i8>, <16 x i8>*
+  ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16>
+  ; DEFAULT: add <16 x i16>
+  ; DEFAULT-NOT: llvm.masked.load
+  ; DEFAULT-NOT: llvm.masked.store
+  ;
+  ; TAILPRED: llvm.masked.load.v8i8.p0v8i8
+  ; TAILPRED: sext <8 x i8> %{{.*}} to <8 x i16>
+  ; TAILPRED: add <8 x i16>
+  ; TAILPRED: call void @llvm.masked.store.v8i8.p0v8i8
+  ; TAILPRED-NOT: load <16 x i8>, <16 x i8>*
+  %3 = load i8*, i8** @Input, align 8, !tbaa !0
+  %4 = sext i32 %1 to i64
+  %5 = icmp eq i32 %1, 0
+  br i1 %5, label %._crit_edge, label %.preheader47.preheader
+
+.preheader47.preheader:                           ; preds = %2
+  br label %.preheader47
+
+.preheader47:                                     ; preds = %.preheader47.preheader, %53
+  %.050 = phi i64 [ %54, %53 ], [ 0, %.preheader47.preheader ]
+  br label %.preheader
+
+._crit_edge.loopexit:                             ; preds = %53
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %2
+  ret i32 0
+
+.preheader:                                       ; preds = %52, %.preheader47
+  %indvars.iv51 = phi i32 [ 1, %.preheader47 ], [ %indvars.iv.next52, %52 ]
+  %6 = mul nuw nsw i32 %indvars.iv51, 320
+  br label %7
+
+7:                                                ; preds = %7, %.preheader
+  %indvars.iv = phi i32 [ 1, %.preheader ], [ %indvars.iv.next, %7 ]
+  %8 = add nuw nsw i32 %6, %indvars.iv
+  %9 = add nsw i32 %8, -320
+  %10 = add nsw i32 %8, -321
+  %11 = getelementptr inbounds i8, i8* %3, i32 %10
+  %12 = load i8, i8* %11, align 1, !tbaa !4
+  %13 = sext i8 %12 to i32
+  %14 = getelementptr inbounds i8, i8* %3, i32 %9
+  %15 = load i8, i8* %14, align 1, !tbaa !4
+  %16 = sext i8 %15 to i32
+  %17 = add nsw i32 %8, -319
+  %18 = getelementptr inbounds i8, i8* %3, i32 %17
+  %19 = load i8, i8* %18, align 1, !tbaa !4
+  %20 = sext i8 %19 to i32
+  %21 = add nsw i32 %8, -1
+  %22 = getelementptr inbounds i8, i8* %3, i32 %21
+  %23 = load i8, i8* %22, align 1, !tbaa !4
+  %24 = sext i8 %23 to i32
+  %25 = getelementptr inbounds i8, i8* %3, i32 %8
+  %26 = load i8, i8* %25, align 1, !tbaa !4
+  %27 = sext i8 %26 to i32
+  %28 = mul nsw i32 %27, 255
+  %29 = add nuw nsw i32 %8, 1
+  %30 = getelementptr inbounds i8, i8* %3, i32 %29
+  %31 = load i8, i8* %30, align 1, !tbaa !4
+  %32 = sext i8 %31 to i32
+  %33 = add nuw nsw i32 %8, 320
+  %34 = add nuw nsw i32 %8, 319
+  %35 = getelementptr inbounds i8, i8* %3, i32 %34
+  %36 = load i8, i8* %35, align 1, !tbaa !4
+  %37 = sext i8 %36 to i32
+  %38 = getelementptr inbounds i8, i8* %3, i32 %33
+  %39 = load i8, i8* %38, align 1, !tbaa !4
+  %40 = sext i8 %39 to i32
+  %41 = add nuw nsw i32 %8, 321
+  %42 = getelementptr inbounds i8, i8* %3, i32 %41
+  %43 = load i8, i8* %42, align 1, !tbaa !4
+  %44 = sext i8 %43 to i32
+  %reass.add = add nsw i32 %16, %13
+  %reass.add44 = add nsw i32 %reass.add, %20
+  %reass.add45 = add nsw i32 %reass.add44, %24
+  %45 = add nsw i32 %reass.add45, %32
+  %46 = add nsw i32 %45, %37
+  %47 = add nsw i32 %46, %40
+  %reass.add46 = add nsw i32 %47, %44
+  %reass.mul = mul nsw i32 %reass.add46, -28
+  %48 = add nsw i32 %reass.mul, %28
+  %49 = lshr i32 %48, 8
+  %50 = trunc i32 %49 to i8
+  %51 = getelementptr inbounds i8, i8* %0, i32 %8
+  store i8 %50, i8* %51, align 1, !tbaa !4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 319
+  br i1 %exitcond, label %52, label %7
+
+52:                                               ; preds = %7
+  %indvars.iv.next52 = add nuw nsw i32 %indvars.iv51, 1
+  %exitcond53 = icmp eq i32 %indvars.iv.next52, 239
+  br i1 %exitcond53, label %53, label %.preheader
+
+53:                                               ; preds = %52
+  %54 = add nuw i64 %.050, 1
+  %55 = icmp ult i64 %54, %4
+  br i1 %55, label %.preheader47, label %._crit_edge.loopexit
+}
+
+attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"any pointer", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
+!4 = !{!2, !2, i64 0}