diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1502,9 +1502,6 @@
 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       Align Alignment, unsigned AddressSpace,
                                       TTI::TargetCostKind CostKind) {
-  if (!isa<ScalableVectorType>(Src))
-    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                                        CostKind);
   auto LT = TLI->getTypeLegalizationCost(DL, Src);
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
@@ -1516,7 +1513,7 @@
   if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
-  return LT.first * 2;
+  return isa<ScalableVectorType>(Src) ? LT.first * 2 : LT.first * 2 + LT.first;
 }
 
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
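Not part of the patch itself: a minimal standalone sketch of the cost rule the new return statement encodes, assuming `LT.first` is the number of legal parts legalization splits the vector type into. The helper name `maskedMemOpCost` and the sample part counts are illustrative assumptions, not LLVM API; the expected values mirror the updated CHECK lines in the test below (2 per part for scalable vectors, 2 + 1 = 3 per part for fixed-width vectors, instead of the scalarized BaseT cost previously used for fixed-width types).

#include <cassert>

// Illustrative sketch only (names and values are assumptions): with
// NumParts = number of legal parts a vector type splits into, the new
// return statement costs scalable masked loads/stores at 2 per part and
// fixed-width ones at 2 + 1 = 3 per part.
static unsigned maskedMemOpCost(unsigned NumParts, bool IsScalable) {
  return IsScalable ? NumParts * 2 : NumParts * 2 + NumParts;
}

int main() {
  assert(maskedMemOpCost(1, /*IsScalable=*/true) == 2);   // e.g. <vscale x 16 x i8>
  assert(maskedMemOpCost(4, /*IsScalable=*/true) == 8);   // e.g. <vscale x 64 x i8>
  assert(maskedMemOpCost(1, /*IsScalable=*/false) == 3);  // e.g. <16 x i8>, was 109
  assert(maskedMemOpCost(2, /*IsScalable=*/false) == 6);  // e.g. <32 x i8>
  return 0;
}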
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -3,50 +3,178 @@
 define void @fixed() {
 ; CHECK-LABEL: 'fixed'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>* undef, i32 8, <2 x i1> undef, <2 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 8, <4 x i1> undef, <4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 8, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 109 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 8, <16 x i1> undef, <16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>* undef, i32 8, <2 x i1> undef, <2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 8, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 8, <8 x i1> undef, <8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 8, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 8, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 8, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>* undef, i32 8, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* undef, i32 8, <8 x i1> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 8, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 8, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 8, <2 x i1> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* undef, i32 8, <32 x i1> undef, <32 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i1 = call <1 x i1> @llvm.masked.load.v1i1.p0v1i1(<1 x i1>* undef, i32 8, <1 x i1> undef, <1 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1 = call <2 x i1> @llvm.masked.load.v2i1.p0v2i1(<2 x i1>* undef, i32 8, <2 x i1> undef, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1 = call <4 x i1> @llvm.masked.load.v4i1.p0v4i1(<4 x i1>* undef, i32 8, <4 x i1> undef, <4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1 = call <8 x i1> @llvm.masked.load.v8i1.p0v8i1(<8 x i1>* undef, i32 8, <8 x i1> undef, <8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1 = call <16 x i1> @llvm.masked.load.v16i1.p0v16i1(<16 x i1>* undef, i32 8, <16 x i1> undef, <16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1 = call <32 x i1> @llvm.masked.load.v32i1.p0v32i1(<32 x i1>* undef, i32 8, <32 x i1> undef, <32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v64i1 = call <64 x i1> @llvm.masked.load.v64i1.p0v64i1(<64 x i1>* undef, i32 8, <64 x i1> undef, <64 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v128i1 = call <128 x i1> @llvm.masked.load.v128i1.p0v128i1(<128 x i1>* undef, i32 8, <128 x i1> undef, <128 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v256i1 = call <256 x i1> @llvm.masked.load.v256i1.p0v256i1(<256 x i1>* undef, i32 16, <256 x i1> undef, <256 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v512i1 = call <512 x i1> @llvm.masked.load.v512i1.p0v512i1(<512 x i1>* undef, i32 32, <512 x i1> undef, <512 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v1024i1 = call <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1>* undef, i32 64, <1024 x i1> undef, <1024 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i8 = call <1 x i8> @llvm.masked.load.v1i8.p0v1i8(<1 x i8>* undef, i32 8, <1 x i1> undef, <1 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>* undef, i32 8, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 8, <4 x i1> undef, <4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 8, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 16, <32 x i1> undef, <32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v64i8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 32, <64 x i1> undef, <64 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v128i8 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* undef, i32 64, <128 x i1> undef, <128 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8>* undef, i32 128, <256 x i1> undef, <256 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v512i8 = call <512 x i8> @llvm.masked.load.v512i8.p0v512i8(<512 x i8>* undef, i32 256, <512 x i1> undef, <512 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v1024i8 = call <1024 x i8> @llvm.masked.load.v1024i8.p0v1024i8(<1024 x i8>* undef, i32 512, <1024 x i1> undef, <1024 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i16 = call <1 x i16> @llvm.masked.load.v1i16.p0v1i16(<1 x i16>* undef, i32 8, <1 x i1> undef, <1 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>* undef, i32 8, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 8, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 8, <16 x i1> undef, <16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 16, <32 x i1> undef, <32 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v64i16 = call <64 x i16> @llvm.masked.load.v64i16.p0v64i16(<64 x i16>* undef, i32 32, <64 x i1> undef, <64 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v128i16 = call <128 x i16> @llvm.masked.load.v128i16.p0v128i16(<128 x i16>* undef, i32 64, <128 x i1> undef, <128 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v256i16 = call <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16>* undef, i32 128, <256 x i1> undef, <256 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v512i16 = call <512 x i16> @llvm.masked.load.v512i16.p0v512i16(<512 x i16>* undef, i32 256, <512 x i1> undef, <512 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i32 = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>* undef, i32 8, <1 x i1> undef, <1 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 8, <2 x i1> undef, <2 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 8, <8 x i1> undef, <8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 8, <16 x i1> undef, <16 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = call <32 x i32> @llvm.masked.load.v32i32.p0v32i32(<32 x i32>* undef, i32 16, <32 x i1> undef, <32 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = call <64 x i32> @llvm.masked.load.v64i32.p0v64i32(<64 x i32>* undef, i32 32, <64 x i1> undef, <64 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v128i32 = call <128 x i32> @llvm.masked.load.v128i32.p0v128i32(<128 x i32>* undef, i32 64, <128 x i1> undef, <128 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v256i32 = call <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>* undef, i32 128, <256 x i1> undef, <256 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v512i32 = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* undef, i32 256, <512 x i1> undef, <512 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v1024i32 = call <1024 x i32> @llvm.masked.load.v1024i32.p0v1024i32(<1024 x i32>* undef, i32 512, <1024 x i1> undef, <1024 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %v2048i32 = call <2048 x i32> @llvm.masked.load.v2048i32.p0v2048i32(<2048 x i32>* undef, i32 1024, <2048 x i1> undef, <2048 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 4, <1 x i1> undef, <1 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8i64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 16, <8 x i1> undef, <8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16i64 = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* undef, i32 32, <16 x i1> undef, <16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v32i64 = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* undef, i32 64, <32 x i1> undef, <32 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v64i64 = call <64 x i64> @llvm.masked.load.v64i64.p0v64i64(<64 x i64>* undef, i32 128, <64 x i1> undef, <64 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v128i64 = call <128 x i64> @llvm.masked.load.v128i64.p0v128i64(<128 x i64>* undef, i32 256, <128 x i1> undef, <128 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v256i64 = call <256 x i64> @llvm.masked.load.v256i64.p0v256i64(<256 x i64>* undef, i32 512, <256 x i1> undef, <256 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1f16 = call <1 x half> @llvm.masked.load.v1f16.p0v1f16(<1 x half>* undef, i32 8, <1 x i1> undef, <1 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>* undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* undef, i32 8, <16 x i1> undef, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v64f16 = call <64 x half> @llvm.masked.load.v64f16.p0v64f16(<64 x half>* undef, i32 8, <64 x i1> undef, <64 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v128f16 = call <128 x half> @llvm.masked.load.v128f16.p0v128f16(<128 x half>* undef, i32 8, <128 x i1> undef, <128 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v256f16 = call <256 x half> @llvm.masked.load.v256f16.p0v256f16(<256 x half>* undef, i32 8, <256 x i1> undef, <256 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v512f16 = call <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half>* undef, i32 8, <512 x i1> undef, <512 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1f32 = call <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float>* undef, i32 8, <1 x i1> undef, <1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 8, <2 x i1> undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 8, <8 x i1> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 8, <16 x i1> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32f32 = call <32 x float> @llvm.masked.load.v32f32.p0v32f32(<32 x float>* undef, i32 8, <32 x i1> undef, <32 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64f32 = call <64 x float> @llvm.masked.load.v64f32.p0v64f32(<64 x float>* undef, i32 8, <64 x i1> undef, <64 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v128f32 = call <128 x float> @llvm.masked.load.v128f32.p0v128f32(<128 x float>* undef, i32 8, <128 x i1> undef, <128 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v256f32 = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* undef, i32 8, <256 x i1> undef, <256 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v512f32 = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* undef, i32 8, <512 x i1> undef, <512 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v1024f32 = call <1024 x float> @llvm.masked.load.v1024f32.p0v1024f32(<1024 x float>* undef, i32 8, <1024 x i1> undef, <1024 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %v2048f32 = call <2048 x float> @llvm.masked.load.v2048f32.p0v2048f32(<2048 x float>* undef, i32 8, <2048 x i1> undef, <2048 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1f64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 2, <1 x i1> undef, <1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 4, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 8, <4 x i1> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 16, <8 x i1> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f64 = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* undef, i32 32, <16 x i1> undef, <16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v32f64 = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* undef, i32 64, <32 x i1> undef, <32 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v64f64 = call <64 x double> @llvm.masked.load.v64f64.p0v64f64(<64 x double>* undef, i32 128, <64 x i1> undef, <64 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 entry:
   ; Legal fixed-width integer types
+  %v1i1 = call <1 x i1> @llvm.masked.load.v1i1.p0v1i1(<1 x i1> *undef, i32 8, <1 x i1> undef, <1 x i1> undef)
+  %v2i1 = call <2 x i1> @llvm.masked.load.v2i1.p0v2i1(<2 x i1> *undef, i32 8, <2 x i1> undef, <2 x i1> undef)
+  %v4i1 = call <4 x i1> @llvm.masked.load.v4i1.p0v4i1(<4 x i1> *undef, i32 8, <4 x i1> undef, <4 x i1> undef)
+  %v8i1 = call <8 x i1> @llvm.masked.load.v8i1.p0v8i1(<8 x i1> *undef, i32 8, <8 x i1> undef, <8 x i1> undef)
+  %v16i1 = call <16 x i1> @llvm.masked.load.v16i1.p0v16i1(<16 x i1> *undef, i32 8, <16 x i1> undef, <16 x i1> undef)
+  %v32i1 = call <32 x i1> @llvm.masked.load.v32i1.p0v32i1(<32 x i1> *undef, i32 8, <32 x i1> undef, <32 x i1> undef)
+  %v64i1 = call <64 x i1> @llvm.masked.load.v64i1.p0v64i1(<64 x i1> *undef, i32 8, <64 x i1> undef, <64 x i1> undef)
+  %v128i1 = call <128 x i1> @llvm.masked.load.v128i1.p0v128i1(<128 x i1> *undef, i32 8, <128 x i1> undef, <128 x i1> undef)
+  %v256i1 = call <256 x i1> @llvm.masked.load.v256i1.p0v256i1(<256 x i1> *undef, i32 16, <256 x i1> undef, <256 x i1> undef)
+  %v512i1 = call <512 x i1> @llvm.masked.load.v512i1.p0v512i1(<512 x i1> *undef, i32 32, <512 x i1> undef, <512 x i1> undef)
+  %v1024i1 = call <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1> *undef, i32 64, <1024 x i1> undef, <1024 x i1> undef)
+
+  %v1i8 = call <1 x i8> @llvm.masked.load.v1i8.p0v1i8(<1 x i8> *undef, i32 8, <1 x i1> undef, <1 x i8> undef)
   %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8> *undef, i32 8, <2 x i1> undef, <2 x i8> undef)
   %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8> *undef, i32 8, <4 x i1> undef, <4 x i8> undef)
   %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8> *undef, i32 8, <8 x i1> undef, <8 x i8> undef)
   %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8> *undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+  %v32i8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8> *undef, i32 16, <32 x i1> undef, <32 x i8> undef)
+  %v64i8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8> *undef, i32 32, <64 x i1> undef, <64 x i8> undef)
+  %v128i8 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8> *undef, i32 64, <128 x i1> undef, <128 x i8> undef)
+  %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8> *undef, i32 128, <256 x i1> undef, <256 x i8> undef)
+  %v512i8 = call <512 x i8> @llvm.masked.load.v512i8.p0v512i8(<512 x i8> *undef, i32 256, <512 x i1> undef, <512 x i8> undef)
+  %v1024i8 = call <1024 x i8> @llvm.masked.load.v1024i8.p0v1024i8(<1024 x i8> *undef, i32 512, <1024 x i1> undef, <1024 x i8> undef)
+
+  %v1i16 = call <1 x i16> @llvm.masked.load.v1i16.p0v1i16(<1 x i16> *undef, i32 8, <1 x i1> undef, <1 x i16> undef)
   %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16> *undef, i32 8, <2 x i1> undef, <2 x i16> undef)
   %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16> *undef, i32 8, <4 x i1> undef, <4 x i16> undef)
   %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16> *undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+  %v16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16> *undef, i32 8, <16 x i1> undef, <16 x i16> undef)
+  %v32i16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16> *undef, i32 16, <32 x i1> undef, <32 x i16> undef)
+  %v64i16 = call <64 x i16> @llvm.masked.load.v64i16.p0v64i16(<64 x i16> *undef, i32 32, <64 x i1> undef, <64 x i16> undef)
+  %v128i16 = call <128 x i16> @llvm.masked.load.v128i16.p0v128i16(<128 x i16> *undef, i32 64, <128 x i1> undef, <128 x i16> undef)
+  %v256i16 = call <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16> *undef, i32 128, <256 x i1> undef, <256 x i16> undef)
+  %v512i16 = call <512 x i16> @llvm.masked.load.v512i16.p0v512i16(<512 x i16> *undef, i32 256, <512 x i1> undef, <512 x i16> undef)
+
+  %v1i32 = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32> *undef, i32 8, <1 x i1> undef, <1 x i32> undef)
   %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32> *undef, i32 8, <2 x i1> undef, <2 x i32> undef)
   %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32> *undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+  %v8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32> *undef, i32 8, <8 x i1> undef, <8 x i32> undef)
+  %v16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32> *undef, i32 8, <16 x i1> undef, <16 x i32> undef)
+  %v32i32 = call <32 x i32> @llvm.masked.load.v32i32.p0v32i32(<32 x i32> *undef, i32 16, <32 x i1> undef, <32 x i32> undef)
+  %v64i32 = call <64 x i32> @llvm.masked.load.v64i32.p0v64i32(<64 x i32> *undef, i32 32, <64 x i1> undef, <64 x i32> undef)
+  %v128i32 = call <128 x i32> @llvm.masked.load.v128i32.p0v128i32(<128 x i32> *undef, i32 64, <128 x i1> undef, <128 x i32> undef)
+  %v256i32 = call <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32> *undef, i32 128, <256 x i1> undef, <256 x i32> undef)
+  %v512i32 = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32> *undef, i32 256, <512 x i1> undef, <512 x i32> undef)
+  %v1024i32 = call <1024 x i32> @llvm.masked.load.v1024i32.p0v1024i32(<1024 x i32> *undef, i32 512, <1024 x i1> undef, <1024 x i32> undef)
+  %v2048i32 = call <2048 x i32> @llvm.masked.load.v2048i32.p0v2048i32(<2048 x i32> *undef, i32 1024, <2048 x i1> undef, <2048 x i32> undef)
+
+  %v1i64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64> *undef, i32 4, <1 x i1> undef, <1 x i64> undef)
   %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64> *undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+  %v8i64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64> *undef, i32 16, <8 x i1> undef, <8 x i64> undef)
+  %v16i64 = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64> *undef, i32 32, <16 x i1> undef, <16 x i64> undef)
+  %v32i64 = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64> *undef, i32 64, <32 x i1> undef, <32 x i64> undef)
+  %v64i64 = call <64 x i64> @llvm.masked.load.v64i64.p0v64i64(<64 x i64> *undef, i32 128, <64 x i1> undef, <64 x i64> undef)
+  %v128i64 = call <128 x i64> @llvm.masked.load.v128i64.p0v128i64(<128 x i64> *undef, i32 256, <128 x i1> undef, <128 x i64> undef)
+  %v256i64 = call <256 x i64> @llvm.masked.load.v256i64.p0v256i64(<256 x i64> *undef, i32 512, <256 x i1> undef, <256 x i64> undef)
-  ; Legal fixed-width floating point types
+  %v1f16 = call <1 x half> @llvm.masked.load.v1f16.p0v1f16(<1 x half> *undef, i32 8, <1 x i1> undef, <1 x half> undef)
   %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half> *undef, i32 8, <2 x i1> undef, <2 x half> undef)
   %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half> *undef, i32 8, <4 x i1> undef, <4 x half> undef)
   %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half> *undef, i32 8, <8 x i1> undef, <8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half> *undef, i32 8, <16 x i1> undef, <16 x half> undef)
+  %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half> *undef, i32 8, <32 x i1> undef, <32 x half> undef)
+  %v64f16 = call <64 x half> @llvm.masked.load.v64f16.p0v64f16(<64 x half> *undef, i32 8, <64 x i1> undef, <64 x half> undef)
+  %v128f16 = call <128 x half> @llvm.masked.load.v128f16.p0v128f16(<128 x half> *undef, i32 8, <128 x i1> undef, <128 x half> undef)
+  %v256f16 = call <256 x half> @llvm.masked.load.v256f16.p0v256f16(<256 x half> *undef, i32 8, <256 x i1> undef, <256 x half> undef)
+  %v512f16 = call <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half> *undef, i32 8, <512 x i1> undef, <512 x half> undef)
+
+  %v1f32 = call <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float> *undef, i32 8, <1 x i1> undef, <1 x float> undef)
   %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float> *undef, i32 8, <2 x i1> undef, <2 x float> undef)
   %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float> *undef, i32 8, <4 x i1> undef, <4 x float> undef)
-  %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double> *undef, i32 8, <2 x i1> undef, <2 x double> undef)
+  %v8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float> *undef, i32 8, <8 x i1> undef, <8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float> *undef, i32 8, <16 x i1> undef, <16 x float> undef)
+  %v32f32 = call <32 x float> @llvm.masked.load.v32f32.p0v32f32(<32 x float> *undef, i32 8, <32 x i1> undef, <32 x float> undef)
+  %v64f32 = call <64 x float> @llvm.masked.load.v64f32.p0v64f32(<64 x float> *undef, i32 8, <64 x i1> undef, <64 x float> undef)
+  %v128f32 = call <128 x float> @llvm.masked.load.v128f32.p0v128f32(<128 x float> *undef, i32 8, <128 x i1> undef, <128 x float> undef)
+  %v256f32 = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float> *undef, i32 8, <256 x i1> undef, <256 x float> undef)
+  %v512f32 = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float> *undef, i32 8, <512 x i1> undef, <512 x float> undef)
+  %v1024f32 = call <1024 x float> @llvm.masked.load.v1024f32.p0v1024f32(<1024 x float> *undef, i32 8, <1024 x i1> undef, <1024 x float> undef)
+  %v2048f32 = call <2048 x float> @llvm.masked.load.v2048f32.p0v2048f32(<2048 x float> *undef, i32 8, <2048 x i1> undef, <2048 x float> undef)
-  ; A couple of examples of illegal fixed-width types
-  %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64> *undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-  %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half> *undef, i32 8, <32 x i1> undef, <32 x half> undef)
+  %v1f64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double> *undef, i32 2, <1 x i1> undef, <1 x double> undef)
+  %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double> *undef, i32 4, <2 x i1> undef, <2 x double> undef)
+  %v4f64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double> *undef, i32 8, <4 x i1> undef, <4 x double> undef)
+  %v8f64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double> *undef, i32 16, <8 x i1> undef, <8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double> *undef, i32 32, <16 x i1> undef, <16 x double> undef)
+  %v32f64 = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double> *undef, i32 64, <32 x i1> undef, <32 x double> undef)
+  %v64f64 = call <64 x double> @llvm.masked.load.v64f64.p0v64f64(<64 x double> *undef, i32 128, <64 x i1> undef, <64 x double> undef)
   ret void
 }
@@ -54,92 +182,372 @@
 define void @scalable() {
 ; CHECK-LABEL: 'scalable'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i1 = call <vscale x 1 x i1> @llvm.masked.load.nxv1i1.p0nxv1i1(<vscale x 1 x i1>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1 = call <vscale x 2 x i1> @llvm.masked.load.nxv2i1.p0nxv2i1(<vscale x 2 x i1>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0nxv4i1(<vscale x 4 x i1>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1 = call <vscale x 8 x i1> @llvm.masked.load.nxv8i1.p0nxv8i1(<vscale x 8 x i1>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1 = call <vscale x 16 x i1> @llvm.masked.load.nxv16i1.p0nxv16i1(<vscale x 16 x i1>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1 = call <vscale x 32 x i1> @llvm.masked.load.nxv32i1.p0nxv32i1(<vscale x 32 x i1>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i1 = call <vscale x 64 x i1> @llvm.masked.load.nxv64i1.p0nxv64i1(<vscale x 64 x i1>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv128i1 = call <vscale x 128 x i1> @llvm.masked.load.nxv128i1.p0nxv128i1(<vscale x 128 x i1>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv256i1 = call <vscale x 256 x i1> @llvm.masked.load.nxv256i1.p0nxv256i1(<vscale x 256 x i1>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv512i1 = call <vscale x 512 x i1> @llvm.masked.load.nxv512i1.p0nxv512i1(<vscale x 512 x i1>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv1024i1 = call <vscale x 1024 x i1> @llvm.masked.load.nxv1024i1.p0nxv1024i1(<vscale x 1024 x i1>* undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i1> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i8 = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8.p0nxv1i8(<vscale x 1 x i8>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0nxv32i8(<vscale x 32 x i8>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i8 = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0nxv64i8(<vscale x 64 x i8>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv128i8 = call <vscale x 128 x i8> @llvm.masked.load.nxv128i8.p0nxv128i8(<vscale x 128 x i8>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv256i8 = call <vscale x 256 x i8> @llvm.masked.load.nxv256i8.p0nxv256i8(<vscale x 256 x i8>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv512i8 = call <vscale x 512 x i8> @llvm.masked.load.nxv512i8.p0nxv512i8(<vscale x 512 x i8>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv1024i8 = call <vscale x 1024 x i8> @llvm.masked.load.nxv1024i8.p0nxv1024i8(<vscale x 1024 x i8>* undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i8> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i16 = call <vscale x 1 x i16> @llvm.masked.load.nxv1i16.p0nxv1i16(<vscale x 1 x i16>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0nxv16i16(<vscale x 16 x i16>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i16 = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16.p0nxv32i16(<vscale x 32 x i16>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i16 = call <vscale x 64 x i16> @llvm.masked.load.nxv64i16.p0nxv64i16(<vscale x 64 x i16>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv128i16 = call <vscale x 128 x i16> @llvm.masked.load.nxv128i16.p0nxv128i16(<vscale x 128 x i16>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv256i16 = call <vscale x 256 x i16> @llvm.masked.load.nxv256i16.p0nxv256i16(<vscale x 256 x i16>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv512i16 = call <vscale x 512 x i16> @llvm.masked.load.nxv512i16.p0nxv512i16(<vscale x 512 x i16>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i16> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i32 = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0nxv1i32(<vscale x 1 x i32>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0nxv8i32(<vscale x 8 x i32>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = call <vscale x 16 x i32> @llvm.masked.load.nxv16i32.p0nxv16i32(<vscale x 16 x i32>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i32 = call <vscale x 32 x i32> @llvm.masked.load.nxv32i32.p0nxv32i32(<vscale x 32 x i32>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv64i32 = call <vscale x 64 x i32> @llvm.masked.load.nxv64i32.p0nxv64i32(<vscale x 64 x i32>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv128i32 = call <vscale x 128 x i32> @llvm.masked.load.nxv128i32.p0nxv128i32(<vscale x 128 x i32>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv256i32 = call <vscale x 256 x i32> @llvm.masked.load.nxv256i32.p0nxv256i32(<vscale x 256 x i32>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %nxv512i32 = call <vscale x 512 x i32> @llvm.masked.load.nxv512i32.p0nxv512i32(<vscale x 512 x i32>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %nxv1024i32 = call <vscale x 1024 x i32> @llvm.masked.load.nxv1024i32.p0nxv1024i32(<vscale x 1024 x i32>* undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %nxv2048i32 = call <vscale x 2048 x i32> @llvm.masked.load.nxv2048i32.p0nxv2048i32(<vscale x 2048 x i32>* undef, i32 8, <vscale x 2048 x i1> undef, <vscale x 2048 x i32> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0nxv8i64(<vscale x 8 x i64>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i64 = call <vscale x 16 x i64> @llvm.masked.load.nxv16i64.p0nxv16i64(<vscale x 16 x i64>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv32i64 = call <vscale x 32 x i64> @llvm.masked.load.nxv32i64.p0nxv32i64(<vscale x 32 x i64>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv64i64 = call <vscale x 64 x i64> @llvm.masked.load.nxv64i64.p0nxv64i64(<vscale x 64 x i64>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv128i64 = call <vscale x 128 x i64> @llvm.masked.load.nxv128i64.p0nxv128i64(<vscale x 128 x i64>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %nxv256i64 = call <vscale x 256 x i64> @llvm.masked.load.nxv256i64.p0nxv256i64(<vscale x 256 x i64>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i64> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = call <vscale x 1 x half> @llvm.masked.load.nxv1f16.p0nxv1f16(<vscale x 1 x half>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = call <vscale x 16 x half> @llvm.masked.load.nxv16f16.p0nxv16f16(<vscale x 16 x half>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64f16 = call <vscale x 64 x half> @llvm.masked.load.nxv64f16.p0nxv64f16(<vscale x 64 x half>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv128f16 = call <vscale x 128 x half> @llvm.masked.load.nxv128f16.p0nxv128f16(<vscale x 128 x half>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv256f16 = call <vscale x 256 x half> @llvm.masked.load.nxv256f16.p0nxv256f16(<vscale x 256 x half>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv512f16 = call <vscale x 512 x half> @llvm.masked.load.nxv512f16.p0nxv512f16(<vscale x 512 x half>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x half> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1f32 = call <vscale x 1 x float> @llvm.masked.load.nxv1f32.p0nxv1f32(<vscale x 1 x float>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32 = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0nxv8f32(<vscale x 8 x float>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32 = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0nxv16f32(<vscale x 16 x float>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32f32 = call <vscale x 32 x float> @llvm.masked.load.nxv32f32.p0nxv32f32(<vscale x 32 x float>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv64f32 = call <vscale x 64 x float> @llvm.masked.load.nxv64f32.p0nxv64f32(<vscale x 64 x float>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv128f32 = call <vscale x 128 x float> @llvm.masked.load.nxv128f32.p0nxv128f32(<vscale x 128 x float>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv256f32 = call <vscale x 256 x float> @llvm.masked.load.nxv256f32.p0nxv256f32(<vscale x 256 x float>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %nxv512f32 = call <vscale x 512 x float> @llvm.masked.load.nxv512f32.p0nxv512f32(<vscale x 512 x float>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %nxv1024f32 = call <vscale x 1024 x float> @llvm.masked.load.nxv1024f32.p0nxv1024f32(<vscale x 1024 x float>* undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %nxv2048f32 = call <vscale x 2048 x float> @llvm.masked.load.nxv2048f32.p0nxv2048f32(<vscale x 2048 x float>* undef, i32 8, <vscale x 2048 x i1> undef, <vscale x 2048 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64 = call <vscale x 1 x double> @llvm.masked.load.nxv1f64.p0nxv1f64(<vscale x 1 x double>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64 = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = call <vscale x 8 x double> @llvm.masked.load.nxv8f64.p0nxv8f64(<vscale x 8 x double>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16f64 = call <vscale x 16 x double> @llvm.masked.load.nxv16f64.p0nxv16f64(<vscale x 16 x double>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv32f64 = call <vscale x 32 x double> @llvm.masked.load.nxv32f64.p0nxv32f64(<vscale x 32 x double>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv64f64 = call <vscale x 64 x double> @llvm.masked.load.nxv64f64.p0nxv64f64(<vscale x 64 x double>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv128f64 = call <vscale x 128 x double> @llvm.masked.load.nxv128f64.p0nxv128f64(<vscale x 128 x double>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %nxv256f64 = call <vscale x 256 x double> @llvm.masked.load.nxv256f64.p0nxv256f64(<vscale x 256 x double>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
-  ; Legal scalable integer types
+  ; integer types
+  %nxv1i1 = call <vscale x 1 x i1> @llvm.masked.load.nxv1i1.p0nxv1i1(<vscale x 1 x i1> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i1> undef)
+  %nxv2i1 = call <vscale x 2 x i1> @llvm.masked.load.nxv2i1.p0nxv2i1(<vscale x 2 x i1> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i1> undef)
+  %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0nxv4i1(<vscale x 4 x i1> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
+  %nxv8i1 = call <vscale x 8 x i1> @llvm.masked.load.nxv8i1.p0nxv8i1(<vscale x 8 x i1> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i1> undef)
+  %nxv16i1 = call <vscale x 16 x i1> @llvm.masked.load.nxv16i1.p0nxv16i1(<vscale x 16 x i1> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i1> undef)
+  %nxv32i1 = call <vscale x 32 x i1> @llvm.masked.load.nxv32i1.p0nxv32i1(<vscale x 32 x i1> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef)
+  %nxv64i1 = call <vscale x 64 x i1> @llvm.masked.load.nxv64i1.p0nxv64i1(<vscale x 64 x i1> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i1> undef)
+  %nxv128i1 = call <vscale x 128 x i1> @llvm.masked.load.nxv128i1.p0nxv128i1(<vscale x 128 x i1> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i1> undef)
+  %nxv256i1 = call <vscale x 256 x i1> @llvm.masked.load.nxv256i1.p0nxv256i1(<vscale x 256 x i1> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i1> undef)
+  %nxv512i1 = call <vscale x 512 x i1> @llvm.masked.load.nxv512i1.p0nxv512i1(<vscale x 512 x i1> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i1> undef)
+  %nxv1024i1 = call <vscale x 1024 x i1> @llvm.masked.load.nxv1024i1.p0nxv1024i1(<vscale x 1024 x i1> *undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i1> undef)
+
+  %nxv1i8 = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8.p0nxv1i8(<vscale x 1 x i8> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef)
   %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
   %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
   %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
   %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+  %nxv32i8 = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0nxv32i8(<vscale x 32 x i8> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef)
+  %nxv64i8 = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0nxv64i8(<vscale x 64 x i8> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef)
+  %nxv128i8 = call <vscale x 128 x i8> @llvm.masked.load.nxv128i8.p0nxv128i8(<vscale x 128 x i8> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i8> undef)
+  %nxv256i8 = call <vscale x 256 x i8> @llvm.masked.load.nxv256i8.p0nxv256i8(<vscale x 256 x i8> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i8> undef)
+  %nxv512i8 = call <vscale x 512 x i8> @llvm.masked.load.nxv512i8.p0nxv512i8(<vscale x 512 x i8> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i8> undef)
+  %nxv1024i8 = call <vscale x 1024 x i8> @llvm.masked.load.nxv1024i8.p0nxv1024i8(<vscale x 1024 x i8> *undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i8> undef)
+
+  %nxv1i16 = call <vscale x 1 x i16> @llvm.masked.load.nxv1i16.p0nxv1i16(<vscale x 1 x i16> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef)
   %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
   %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
   %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+  %nxv16i16 = call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0nxv16i16(<vscale x 16 x i16> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+  %nxv32i16 = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16.p0nxv32i16(<vscale x 32 x i16> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef)
+  %nxv64i16 = call <vscale x 64 x i16> @llvm.masked.load.nxv64i16.p0nxv64i16(<vscale x 64 x i16> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i16> undef)
+  %nxv128i16 = call <vscale x 128 x i16> @llvm.masked.load.nxv128i16.p0nxv128i16(<vscale x 128 x i16> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i16> undef)
+  %nxv256i16 = call <vscale x 256 x i16> @llvm.masked.load.nxv256i16.p0nxv256i16(<vscale x 256 x i16> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i16> undef)
+  %nxv512i16 = call <vscale x 512 x i16> @llvm.masked.load.nxv512i16.p0nxv512i16(<vscale x 512 x i16> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i16> undef)
+
+  %nxv1i32 = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0nxv1i32(<vscale x 1 x i32> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef)
   %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
   %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+  %nxv8i32 = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0nxv8i32(<vscale x 8 x i32> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef)
+  %nxv16i32 = call <vscale x 16 x i32> @llvm.masked.load.nxv16i32.p0nxv16i32(<vscale x 16 x i32> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef)
+  %nxv32i32 = call <vscale x 32 x i32> @llvm.masked.load.nxv32i32.p0nxv32i32(<vscale x 32 x i32> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i32> undef)
+  %nxv64i32 = call <vscale x 64 x i32> @llvm.masked.load.nxv64i32.p0nxv64i32(<vscale x 64 x i32> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i32> undef)
+  %nxv128i32 = call <vscale x 128 x i32> @llvm.masked.load.nxv128i32.p0nxv128i32(<vscale x 128 x i32> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i32> undef)
+  %nxv256i32 = call <vscale x 256 x i32> @llvm.masked.load.nxv256i32.p0nxv256i32(<vscale x 256 x i32> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i32> undef)
+  %nxv512i32 = call <vscale x 512 x i32> @llvm.masked.load.nxv512i32.p0nxv512i32(<vscale x 512 x i32> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i32> undef)
+  %nxv1024i32 = call <vscale x 1024 x i32> @llvm.masked.load.nxv1024i32.p0nxv1024i32(<vscale x 1024 x i32> *undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i32> undef)
+  %nxv2048i32 = call <vscale x 2048 x i32> @llvm.masked.load.nxv2048i32.p0nxv2048i32(<vscale x 2048 x i32> *undef, i32 8, <vscale x 2048 x i1> undef, <vscale x 2048 x i32> undef)
+
+  %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
   %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+  %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+  %nxv8i64 = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0nxv8i64(<vscale x 8 x i64> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+  %nxv16i64 = call <vscale x 16 x i64> @llvm.masked.load.nxv16i64.p0nxv16i64(<vscale x 16 x i64> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+  %nxv32i64 = call <vscale x 32 x i64> @llvm.masked.load.nxv32i64.p0nxv32i64(<vscale x 32 x i64> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i64> undef)
+  %nxv64i64 = call <vscale x 64 x i64> @llvm.masked.load.nxv64i64.p0nxv64i64(<vscale x 64 x i64> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i64> undef)
+  %nxv128i64 = call <vscale x 128 x i64> @llvm.masked.load.nxv128i64.p0nxv128i64(<vscale x 128 x i64> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i64> undef)
+  %nxv256i64 = call <vscale x 256 x i64> @llvm.masked.load.nxv256i64.p0nxv256i64(<vscale x 256 x i64> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i64> undef)
-  ; Legal scalable floating point types
+  ; floating-point types
+  %nxv1f16 = call <vscale x 1 x half> @llvm.masked.load.nxv1f16.p0nxv1f16(<vscale x 1 x half> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x half> undef)
   %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
   %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
   %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
+  %nxv16f16 = call <vscale x 16 x half> @llvm.masked.load.nxv16f16.p0nxv16f16(<vscale x 16 x half> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x half> undef)
+  %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+  %nxv64f16 = call <vscale x 64 x half> @llvm.masked.load.nxv64f16.p0nxv64f16(<vscale x 64 x half> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x half> undef)
+  %nxv128f16 = call <vscale x 128 x half> @llvm.masked.load.nxv128f16.p0nxv128f16(<vscale x 128 x half> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x half> undef)
+  %nxv256f16 = call <vscale x 256 x half> @llvm.masked.load.nxv256f16.p0nxv256f16(<vscale x 256 x half> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x half> undef)
+  %nxv512f16 = call <vscale x 512 x half> @llvm.masked.load.nxv512f16.p0nxv512f16(<vscale x 512 x half> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x half> undef)
+
+  %nxv1f32 = call <vscale x 1 x float> @llvm.masked.load.nxv1f32.p0nxv1f32(<vscale x 1 x float> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x float> undef)
   %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
   %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
-  %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+  %nxv8f32 = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0nxv8f32(<vscale x 8 x float> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+  %nxv16f32 = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0nxv16f32(<vscale x 16 x float> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x float> undef)
+  %nxv32f32 = call <vscale x 32 x float> @llvm.masked.load.nxv32f32.p0nxv32f32(<vscale x 32 x float> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x float> undef)
+  %nxv64f32 = call <vscale x 64 x float> @llvm.masked.load.nxv64f32.p0nxv64f32(<vscale x 64 x float> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x float> undef)
+  %nxv128f32 = call <vscale x 128 x float> @llvm.masked.load.nxv128f32.p0nxv128f32(<vscale x 128 x float> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x float> undef)
+  %nxv256f32 = call <vscale x 256 x float> @llvm.masked.load.nxv256f32.p0nxv256f32(<vscale x 256 x float> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x float> undef)
+  %nxv512f32 = call <vscale x 512 x float> @llvm.masked.load.nxv512f32.p0nxv512f32(<vscale x 512 x float> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x float> undef)
+  %nxv1024f32 = call <vscale x 1024 x float> @llvm.masked.load.nxv1024f32.p0nxv1024f32(<vscale x 1024 x float> *undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x float> undef)
+  %nxv2048f32 = call <vscale x 2048 x float> @llvm.masked.load.nxv2048f32.p0nxv2048f32(<vscale x 2048 x float> *undef, i32 8, <vscale x 2048 x i1> undef, <vscale x 2048 x float> undef)
-  ; A couple of examples of illegal scalable types
-  %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
-  %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-  %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+  %nxv1f64 = call <vscale x 1 x double> @llvm.masked.load.nxv1f64.p0nxv1f64(<vscale x 1 x double> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef)
+  %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+  %nxv4f64 = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+  %nxv8f64 = call <vscale x 8 x double> @llvm.masked.load.nxv8f64.p0nxv8f64(<vscale x 8 x double> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef)
+  %nxv16f64 = call <vscale x 16 x double> @llvm.masked.load.nxv16f64.p0nxv16f64(<vscale x 16 x double> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x double> undef)
+  %nxv32f64 = call <vscale x 32 x double> @llvm.masked.load.nxv32f64.p0nxv32f64(<vscale x 32 x double> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x double> undef)
+  %nxv64f64 = call <vscale x 64 x double> @llvm.masked.load.nxv64f64.p0nxv64f64(<vscale x 64 x double> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x double> undef)
+  %nxv128f64 = call <vscale x 128 x double> @llvm.masked.load.nxv128f64.p0nxv128f64(<vscale x 128 x double> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x double> undef)
+  %nxv256f64 = call <vscale x 256 x double> @llvm.masked.load.nxv256f64.p0nxv256f64(<vscale x 256 x double> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x double> undef)
   ret void
 }
+declare <1 x i1> @llvm.masked.load.v1i1.p0v1i1(<1 x i1>*, i32, <1 x i1>, <1 x i1>)
+declare <2 x i1> @llvm.masked.load.v2i1.p0v2i1(<2 x i1>*, i32, <2 x i1>, <2 x i1>)
+declare <4 x i1> @llvm.masked.load.v4i1.p0v4i1(<4 x i1>*, i32, <4 x i1>, <4 x i1>)
+declare <8 x i1> @llvm.masked.load.v8i1.p0v8i1(<8 x i1>*, i32, <8 x i1>, <8 x i1>)
+declare <16 x i1> @llvm.masked.load.v16i1.p0v16i1(<16 x i1>*, i32, <16 x i1>, <16 x i1>)
+declare <32 x i1> @llvm.masked.load.v32i1.p0v32i1(<32 x i1>*, i32, <32 x i1>, <32 x i1>)
+declare <64 x i1> @llvm.masked.load.v64i1.p0v64i1(<64 x i1>*, i32, <64 x i1>, <64 x i1>)
+declare <128 x i1> @llvm.masked.load.v128i1.p0v128i1(<128 x i1>*, i32, <128 x i1>, <128 x i1>)
+declare <256 x i1> @llvm.masked.load.v256i1.p0v256i1(<256 x i1>*, i32, <256 x i1>, <256 x i1>)
+declare <512 x i1> @llvm.masked.load.v512i1.p0v512i1(<512 x i1>*, i32, <512 x i1>, <512 x i1>)
+declare <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1>*, i32, <1024 x i1>, <1024 x i1>)
+
+declare <1 x i8> @llvm.masked.load.v1i8.p0v1i8(<1 x i8>*, i32, <1 x i1>, <1 x i8>)
 declare <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>*, i32, <2 x i1>, <2 x i8>)
 declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
 declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
+declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
+declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32, <128 x i1>, <128 x i8>)
+declare <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8>*, i32, <256 x i1>, <256 x i8>)
+declare <512 x i8> @llvm.masked.load.v512i8.p0v512i8(<512 x i8>*, i32, <512 x i1>, <512 x i8>)
+declare <1024 x i8> @llvm.masked.load.v1024i8.p0v1024i8(<1024 x i8>*, i32, <1024 x i1>, <1024 x i8>)
+
+declare <1 x i16> @llvm.masked.load.v1i16.p0v1i16(<1 x i16>*, i32, <1 x i1>, <1 x i16>)
 declare <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>*, i32, <2 x i1>, <2 x i16>)
 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
+declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
+declare <64 x i16> @llvm.masked.load.v64i16.p0v64i16(<64 x i16>*, i32, <64 x i1>, <64 x i16>)
+declare <128 x i16> @llvm.masked.load.v128i16.p0v128i16(<128 x i16>*, i32, <128 x i1>, <128 x i16>)
+declare <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16>*, i32, <256 x i1>, <256 x i16>)
+declare <512 x i16> @llvm.masked.load.v512i16.p0v512i16(<512 x i16>*, i32, <512 x i1>, <512 x i16>)
+
+declare <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>*, i32, <1 x i1>, <1 x i32>)
 declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
+declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <32 x i32> @llvm.masked.load.v32i32.p0v32i32(<32 x i32>*, i32, <32 x i1>, <32 x i32>)
+declare <64 x i32> @llvm.masked.load.v64i32.p0v64i32(<64 x i32>*, i32, <64 x i1>, <64 x i32>)
+declare <128 x i32> @llvm.masked.load.v128i32.p0v128i32(<128 x i32>*, i32, <128 x i1>, <128 x i32>)
+declare <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>*, i32, <256 x i1>, <256 x i32>)
+declare <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>*, i32, <512 x i1>, <512 x i32>)
+declare <1024 x i32> @llvm.masked.load.v1024i32.p0v1024i32(<1024 x i32>*, i32, <1024 x i1>, <1024 x i32>)
+declare <2048 x i32> @llvm.masked.load.v2048i32.p0v2048i32(<2048 x i32>*, i32, <2048 x i1>, <2048 x i32>)
+
+declare <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>*, i32, <1 x i1>, <1 x i64>)
 declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
 declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
+declare <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)
+declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>*, i32, <16 x i1>, <16 x i64>)
+declare <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>*, i32, <32 x i1>, <32 x i64>)
+declare <64 x i64> @llvm.masked.load.v64i64.p0v64i64(<64 x i64>*, i32, <64 x i1>, <64 x i64>)
+declare <128 x i64> @llvm.masked.load.v128i64.p0v128i64(<128 x i64>*, i32, <128 x i1>, <128 x i64>)
+declare <256 x i64> @llvm.masked.load.v256i64.p0v256i64(<256 x i64>*, i32, <256 x i1>, <256 x i64>)
+
+declare <1 x half> @llvm.masked.load.v1f16.p0v1f16(<1 x half>*, i32, <1 x i1>, <1 x half>)
 declare <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>*, i32, <2 x i1>, <2 x half>)
 declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>)
 declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
+declare <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>)
 declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>)
+declare <64 x half> @llvm.masked.load.v64f16.p0v64f16(<64 x half>*, i32, <64 x i1>, <64 x half>)
+declare <128 x half> @llvm.masked.load.v128f16.p0v128f16(<128 x half>*, i32, <128 x i1>, <128 x half>)
+declare <256 x half> @llvm.masked.load.v256f16.p0v256f16(<256 x half>*, i32, <256 x i1>, <256 x half>)
+declare <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half>*, i32, <512 x i1>, <512 x half>)
+
+declare <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float>*, i32, <1 x i1>, <1 x float>)
 declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
+declare <32 x float> @llvm.masked.load.v32f32.p0v32f32(<32 x float>*, i32, <32 x i1>, <32 x float>)
+declare <64 x float> @llvm.masked.load.v64f32.p0v64f32(<64 x float>*, i32, <64 x i1>, <64 x float>)
+declare <128 x float> @llvm.masked.load.v128f32.p0v128f32(<128 x float>*, i32, <128 x i1>, <128 x float>)
+declare <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>*, i32, <256 x i1>, <256 x float>)
+declare <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>*, i32, <512 x i1>, <512 x float>)
+declare <1024 x float> @llvm.masked.load.v1024f32.p0v1024f32(<1024 x float>*, i32, <1024 x i1>, <1024 x float>)
+declare <2048 x float> @llvm.masked.load.v2048f32.p0v2048f32(<2048 x float>*, i32, <2048 x i1>, <2048 x float>)
+
+declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>*, i32, <16 x i1>, <16 x double>)
+declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>*, i32, <32 x i1>, <32 x double>)
+declare <64 x double> @llvm.masked.load.v64f64.p0v64f64(<64 x double>*, i32, <64 x i1>, <64 x double>)
+declare <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double>*, i32, <128 x i1>, <128 x double>)
+declare <256 x double> @llvm.masked.load.v256f64.p0v256f64(<256 x double>*, i32, <256 x i1>, <256 x double>)
+declare <vscale x 1 x i1> @llvm.masked.load.nxv1i1.p0nxv1i1(<vscale x 1 x i1>*, i32, <vscale x 1 x i1>, <vscale x 1 x i1>)
+declare <vscale x 2 x i1> @llvm.masked.load.nxv2i1.p0nxv2i1(<vscale x 2 x i1>*, i32, <vscale x 2 x i1>, <vscale x 2 x i1>)
+declare <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0nxv4i1(<vscale x 4 x i1>*, i32, <vscale x 4 x i1>, <vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.masked.load.nxv8i1.p0nxv8i1(<vscale x 8 x i1>*, i32, <vscale x 8 x i1>, <vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.masked.load.nxv16i1.p0nxv16i1(<vscale x 16 x i1>*, i32, <vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 32 x i1> @llvm.masked.load.nxv32i1.p0nxv32i1(<vscale x 32 x i1>*, i32, <vscale x 32 x i1>, <vscale x 32 x i1>)
+declare <vscale x 64 x i1> @llvm.masked.load.nxv64i1.p0nxv64i1(<vscale x 64 x i1>*, i32, <vscale x 64 x i1>, <vscale x 64 x i1>)
+declare <vscale x 128 x i1> @llvm.masked.load.nxv128i1.p0nxv128i1(<vscale x 128 x i1>*, i32, <vscale x 128 x i1>, <vscale x 128 x i1>)
+declare <vscale x 256 x i1> @llvm.masked.load.nxv256i1.p0nxv256i1(<vscale x 256 x i1>*, i32, <vscale x 256 x i1>, <vscale x 256 x i1>)
+declare <vscale x 512 x i1> @llvm.masked.load.nxv512i1.p0nxv512i1(<vscale x 512 x i1>*, i32, <vscale x 512 x i1>, <vscale x 512 x i1>)
+declare <vscale x 1024 x i1> @llvm.masked.load.nxv1024i1.p0nxv1024i1(<vscale x 1024 x i1>*, i32, <vscale x 1024 x i1>, <vscale x 1024 x i1>)
, ) +declare @llvm.masked.load.nxv1i8.p0nxv1i8(*, i32, , ) declare @llvm.masked.load.nxv2i8.p0nxv2i8(*, i32, , ) declare @llvm.masked.load.nxv4i8.p0nxv4i8(*, i32, , ) declare @llvm.masked.load.nxv8i8.p0nxv8i8(*, i32, , ) declare @llvm.masked.load.nxv16i8.p0nxv16i8(*, i32, , ) +declare @llvm.masked.load.nxv32i8.p0nxv32i8(*, i32, , ) +declare @llvm.masked.load.nxv64i8.p0nxv64i8(*, i32, , ) +declare @llvm.masked.load.nxv128i8.p0nxv128i8(*, i32, , ) +declare @llvm.masked.load.nxv256i8.p0nxv256i8(*, i32, , ) +declare @llvm.masked.load.nxv512i8.p0nxv512i8(*, i32, , ) +declare @llvm.masked.load.nxv1024i8.p0nxv1024i8(*, i32, , ) + +declare @llvm.masked.load.nxv1i16.p0nxv1i16(*, i32, , ) declare @llvm.masked.load.nxv2i16.p0nxv2i16(*, i32, , ) declare @llvm.masked.load.nxv4i16.p0nxv4i16(*, i32, , ) declare @llvm.masked.load.nxv8i16.p0nxv8i16(*, i32, , ) +declare @llvm.masked.load.nxv16i16.p0nxv16i16(*, i32, , ) +declare @llvm.masked.load.nxv32i16.p0nxv32i16(*, i32, , ) +declare @llvm.masked.load.nxv64i16.p0nxv64i16(*, i32, , ) +declare @llvm.masked.load.nxv128i16.p0nxv128i16(*, i32, , ) +declare @llvm.masked.load.nxv256i16.p0nxv256i16(*, i32, , ) +declare @llvm.masked.load.nxv512i16.p0nxv512i16(*, i32, , ) + +declare @llvm.masked.load.nxv1i32.p0nxv1i32(*, i32, , ) declare @llvm.masked.load.nxv2i32.p0nxv2i32(*, i32, , ) declare @llvm.masked.load.nxv4i32.p0nxv4i32(*, i32, , ) +declare @llvm.masked.load.nxv8i32.p0nxv8i32(*, i32, , ) +declare @llvm.masked.load.nxv16i32.p0nxv16i32(*, i32, , ) +declare @llvm.masked.load.nxv32i32.p0nxv32i32(*, i32, , ) +declare @llvm.masked.load.nxv64i32.p0nxv64i32(*, i32, , ) +declare @llvm.masked.load.nxv128i32.p0nxv128i32(*, i32, , ) +declare @llvm.masked.load.nxv256i32.p0nxv256i32(*, i32, , ) +declare @llvm.masked.load.nxv512i32.p0nxv512i32(*, i32, , ) +declare @llvm.masked.load.nxv1024i32.p0nxv1024i32(*, i32, , ) +declare @llvm.masked.load.nxv2048i32.p0nxv2048i32(*, i32, , ) + +declare @llvm.masked.load.nxv1i64.p0nxv1i64(*, i32, , ) declare @llvm.masked.load.nxv2i64.p0nxv2i64(*, i32, , ) declare @llvm.masked.load.nxv4i64.p0nxv4i64(*, i32, , ) -declare @llvm.masked.load.nxv1i64.p0nxv1i64(*, i32, , ) +declare @llvm.masked.load.nxv8i64.p0nxv8i64(*, i32, , ) +declare @llvm.masked.load.nxv16i64.p0nxv16i64(*, i32, , ) +declare @llvm.masked.load.nxv32i64.p0nxv32i64(*, i32, , ) +declare @llvm.masked.load.nxv64i64.p0nxv64i64(*, i32, , ) +declare @llvm.masked.load.nxv128i64.p0nxv128i64(*, i32, , ) +declare @llvm.masked.load.nxv256i64.p0nxv256i64(*, i32, , ) + +declare @llvm.masked.load.nxv1f16.p0nxv1f16(*, i32, , ) declare @llvm.masked.load.nxv2f16.p0nxv2f16(*, i32, , ) declare @llvm.masked.load.nxv4f16.p0nxv4f16(*, i32, , ) declare @llvm.masked.load.nxv8f16.p0nxv8f16(*, i32, , ) +declare @llvm.masked.load.nxv16f16.p0nxv16f16(*, i32, , ) declare @llvm.masked.load.nxv32f16.p0nxv32f16(*, i32, , ) +declare @llvm.masked.load.nxv64f16.p0nxv64f16(*, i32, , ) +declare @llvm.masked.load.nxv128f16.p0nxv128f16(*, i32, , ) +declare @llvm.masked.load.nxv256f16.p0nxv256f16(*, i32, , ) +declare @llvm.masked.load.nxv512f16.p0nxv512f16(*, i32, , ) +declare @llvm.masked.load.nxv1024f16.p0nxv1024f16(*, i32, , ) +declare @llvm.masked.load.nxv2048f16.p0nxv2048f16(*, i32, , ) + +declare @llvm.masked.load.nxv1f32.p0nxv1f32(*, i32, , ) declare @llvm.masked.load.nxv2f32.p0nxv2f32(*, i32, , ) declare @llvm.masked.load.nxv4f32.p0nxv4f32(*, i32, , ) +declare @llvm.masked.load.nxv8f32.p0nxv8f32(*, i32, , ) +declare @llvm.masked.load.nxv16f32.p0nxv16f32(*, i32, , ) +declare 
@llvm.masked.load.nxv32f32.p0nxv32f32(*, i32, , ) +declare @llvm.masked.load.nxv64f32.p0nxv64f32(*, i32, , ) +declare @llvm.masked.load.nxv128f32.p0nxv128f32(*, i32, , ) +declare @llvm.masked.load.nxv256f32.p0nxv256f32(*, i32, , ) +declare @llvm.masked.load.nxv512f32.p0nxv512f32(*, i32, , ) +declare @llvm.masked.load.nxv1024f32.p0nxv1024f32(*, i32, , ) +declare @llvm.masked.load.nxv2048f32.p0nxv2048f32(*, i32, , ) + +declare @llvm.masked.load.nxv1f64.p0nxv1f64(*, i32, , ) declare @llvm.masked.load.nxv2f64.p0nxv2f64(*, i32, , ) +declare @llvm.masked.load.nxv4f64.p0nxv4f64(*, i32, , ) +declare @llvm.masked.load.nxv8f64.p0nxv8f64(*, i32, , ) +declare @llvm.masked.load.nxv16f64.p0nxv16f64(*, i32, , ) +declare @llvm.masked.load.nxv32f64.p0nxv32f64(*, i32, , ) +declare @llvm.masked.load.nxv64f64.p0nxv64f64(*, i32, , ) +declare @llvm.masked.load.nxv128f64.p0nxv128f64(*, i32, , ) +declare @llvm.masked.load.nxv256f64.p0nxv256f64(*, i32, , )