diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1502,9 +1502,6 @@
 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       Align Alignment, unsigned AddressSpace,
                                       TTI::TargetCostKind CostKind) {
-  if (!isa<ScalableVectorType>(Src))
-    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                                        CostKind);
   auto LT = TLI->getTypeLegalizationCost(DL, Src);
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
@@ -1516,7 +1513,7 @@
   if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
-  return LT.first * 2;
+  return isa<ScalableVectorType>(Src) ? LT.first * 2 : LT.first * 2 + LT.first;
 }
 
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
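Not part of the patch itself: a minimal standalone sketch of the cost rule the new return statement encodes, assuming `LT.first` is the number of legal parts legalization splits the vector type into. The helper name `maskedMemOpCost` and the sample part counts are illustrative assumptions, not LLVM API; the expected values mirror the updated CHECK lines in the test below (2 per part for scalable vectors, 2 + 1 = 3 per part for fixed-width vectors, instead of the scalarized BaseT cost previously used for fixed-width types).

#include <cassert>

// Illustrative sketch only (names and values are assumptions): with
// NumParts = number of legal parts a vector type splits into, the new
// return statement costs scalable masked loads/stores at 2 per part and
// fixed-width ones at 2 + 1 = 3 per part.
static unsigned maskedMemOpCost(unsigned NumParts, bool IsScalable) {
  return IsScalable ? NumParts * 2 : NumParts * 2 + NumParts;
}

int main() {
  assert(maskedMemOpCost(1, /*IsScalable=*/true) == 2);   // e.g. <vscale x 16 x i8>
  assert(maskedMemOpCost(4, /*IsScalable=*/true) == 8);   // e.g. <vscale x 64 x i8>
  assert(maskedMemOpCost(1, /*IsScalable=*/false) == 3);  // e.g. <16 x i8>, was 109
  assert(maskedMemOpCost(2, /*IsScalable=*/false) == 6);  // e.g. <32 x i8>
  return 0;
}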
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -3,50 +3,178 @@
 define void @fixed() {
 ; CHECK-LABEL: 'fixed'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>* undef, i32 8, <2 x i1> undef, <2 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 8, <4 x i1> undef, <4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 8, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 109 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 8, <16 x i1> undef, <16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>* undef, i32 8, <2 x i1> undef, <2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 8, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 8, <8 x i1> undef, <8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 8, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 8, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 8, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>* undef, i32 8, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* undef, i32 8, <8 x i1> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 8, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 8, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 8, <2 x i1> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* undef, i32 8, <32 x i1> undef, <32 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i1 = call <1 x i1> @llvm.masked.load.v1i1.p0v1i1(<1 x i1>* undef, i32 8, <1 x i1> undef, <1 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1 = call <2 x i1> @llvm.masked.load.v2i1.p0v2i1(<2 x i1>* undef, i32 8, <2 x i1> undef, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1 = call <4 x i1> @llvm.masked.load.v4i1.p0v4i1(<4 x i1>* undef, i32 8, <4 x i1> undef, <4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1 = call <8 x i1> @llvm.masked.load.v8i1.p0v8i1(<8 x i1>* undef, i32 8, <8 x i1> undef, <8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1 = call <16 x i1> @llvm.masked.load.v16i1.p0v16i1(<16 x i1>* undef, i32 8, <16 x i1> undef, <16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i1 = call <32 x i1> @llvm.masked.load.v32i1.p0v32i1(<32 x i1>* undef, i32 8, <32 x i1> undef, <32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v64i1 = call <64 x i1> @llvm.masked.load.v64i1.p0v64i1(<64 x i1>* undef, i32 8, <64 x i1> undef, <64 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v128i1 = call <128 x i1> @llvm.masked.load.v128i1.p0v128i1(<128 x i1>* undef, i32 8, <128 x i1> undef, <128 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v256i1 = call <256 x i1> @llvm.masked.load.v256i1.p0v256i1(<256 x i1>* undef, i32 16, <256 x i1> undef, <256 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v512i1 = call <512 x i1> @llvm.masked.load.v512i1.p0v512i1(<512 x i1>* undef, i32 32, <512 x i1> undef, <512 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v1024i1 = call <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1>* undef, i32 64, <1024 x i1> undef, <1024 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i8 = call <1 x i8> @llvm.masked.load.v1i8.p0v1i8(<1 x i8>* undef, i32 8, <1 x i1> undef, <1 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>* undef, i32 8, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 8, <4 x i1> undef, <4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 8, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 16, <32 x i1> undef, <32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v64i8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 32, <64 x i1> undef, <64 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v128i8 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* undef, i32 64, <128 x i1> undef, <128 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8>* undef, i32 128, <256 x i1> undef, <256 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v512i8 = call <512 x i8> @llvm.masked.load.v512i8.p0v512i8(<512 x i8>* undef, i32 256, <512 x i1> undef, <512 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v1024i8 = call <1024 x i8> @llvm.masked.load.v1024i8.p0v1024i8(<1024 x i8>* undef, i32 512, <1024 x i1> undef, <1024 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i16 = call <1 x i16> @llvm.masked.load.v1i16.p0v1i16(<1 x i16>* undef, i32 8, <1 x i1> undef, <1 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>* undef, i32 8, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 8, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 8, <16 x i1> undef, <16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 16, <32 x i1> undef, <32 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v64i16 = call <64 x i16> @llvm.masked.load.v64i16.p0v64i16(<64 x i16>* undef, i32 32, <64 x i1> undef, <64 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v128i16 = call <128 x i16> @llvm.masked.load.v128i16.p0v128i16(<128 x i16>* undef, i32 64, <128 x i1> undef, <128 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v256i16 = call <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16>* undef, i32 128, <256 x i1> undef, <256 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v512i16 = call <512 x i16> @llvm.masked.load.v512i16.p0v512i16(<512 x i16>* undef, i32 256, <512 x i1> undef, <512 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i32 = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>* undef, i32 8, <1 x i1> undef, <1 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 8, <2 x i1> undef, <2 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 8, <8 x i1> undef, <8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 8, <16 x i1> undef, <16 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = call <32 x i32> @llvm.masked.load.v32i32.p0v32i32(<32 x i32>* undef, i32 16, <32 x i1> undef, <32 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = call <64 x i32> @llvm.masked.load.v64i32.p0v64i32(<64 x i32>* undef, i32 32, <64 x i1> undef, <64 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v128i32 = call <128 x i32> @llvm.masked.load.v128i32.p0v128i32(<128 x i32>* undef, i32 64, <128 x i1> undef, <128 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v256i32 = call <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>* undef, i32 128, <256 x i1> undef, <256 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v512i32 = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* undef, i32 256, <512 x i1> undef, <512 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v1024i32 = call <1024 x i32> @llvm.masked.load.v1024i32.p0v1024i32(<1024 x i32>* undef, i32 512, <1024 x i1> undef, <1024 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %v2048i32 = call <2048 x i32> @llvm.masked.load.v2048i32.p0v2048i32(<2048 x i32>* undef, i32 1024, <2048 x i1> undef, <2048 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1i64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 4, <1 x i1> undef, <1 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8i64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 16, <8 x i1> undef, <8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16i64 = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* undef, i32 32, <16 x i1> undef, <16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v32i64 = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* undef, i32 64, <32 x i1> undef, <32 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v64i64 = call <64 x i64> @llvm.masked.load.v64i64.p0v64i64(<64 x i64>* undef, i32 128, <64 x i1> undef, <64 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v128i64 = call <128 x i64> @llvm.masked.load.v128i64.p0v128i64(<128 x i64>* undef, i32 256, <128 x i1> undef, <128 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v256i64 = call <256 x i64> @llvm.masked.load.v256i64.p0v256i64(<256 x i64>* undef, i32 512, <256 x i1> undef, <256 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1f16 = call <1 x half> @llvm.masked.load.v1f16.p0v1f16(<1 x half>* undef, i32 8, <1 x i1> undef, <1 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>* undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* undef, i32 8, <16 x i1> undef, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v64f16 = call <64 x half> @llvm.masked.load.v64f16.p0v64f16(<64 x half>* undef, i32 8, <64 x i1> undef, <64 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v128f16 = call <128 x half> @llvm.masked.load.v128f16.p0v128f16(<128 x half>* undef, i32 8, <128 x i1> undef, <128 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v256f16 = call <256 x half> @llvm.masked.load.v256f16.p0v256f16(<256 x half>* undef, i32 8, <256 x i1> undef, <256 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v512f16 = call <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half>* undef, i32 8, <512 x i1> undef, <512 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1f32 = call <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float>* undef, i32 8, <1 x i1> undef, <1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 8, <2 x i1> undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 8, <8 x i1> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 8, <16 x i1> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32f32 = call <32 x float> @llvm.masked.load.v32f32.p0v32f32(<32 x float>* undef, i32 8, <32 x i1> undef, <32 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64f32 = call <64 x float> @llvm.masked.load.v64f32.p0v64f32(<64 x float>* undef, i32 8, <64 x i1> undef, <64 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v128f32 = call <128 x float> @llvm.masked.load.v128f32.p0v128f32(<128 x float>* undef, i32 8, <128 x i1> undef, <128 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v256f32 = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* undef, i32 8, <256 x i1> undef, <256 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v512f32 = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* undef, i32 8, <512 x i1> undef, <512 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v1024f32 = call <1024 x float> @llvm.masked.load.v1024f32.p0v1024f32(<1024 x float>* undef, i32 8, <1024 x i1> undef, <1024 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %v2048f32 = call <2048 x float> @llvm.masked.load.v2048f32.p0v2048f32(<2048 x float>* undef, i32 8, <2048 x i1> undef, <2048 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1f64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 2, <1 x i1> undef, <1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 4, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 8, <4 x i1> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 16, <8 x i1> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f64 = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* undef, i32 32, <16 x i1> undef, <16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v32f64 = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* undef, i32 64, <32 x i1> undef, <32 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v64f64 = call <64 x double> @llvm.masked.load.v64f64.p0v64f64(<64 x double>* undef, i32 128, <64 x i1> undef, <64 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 entry:
   ; Legal fixed-width integer types
+  %v1i1 = call <1 x i1> @llvm.masked.load.v1i1.p0v1i1(<1 x i1> *undef, i32 8, <1 x i1> undef, <1 x i1> undef)
+  %v2i1 = call <2 x i1> @llvm.masked.load.v2i1.p0v2i1(<2 x i1> *undef, i32 8, <2 x i1> undef, <2 x i1> undef)
+  %v4i1 = call <4 x i1> @llvm.masked.load.v4i1.p0v4i1(<4 x i1> *undef, i32 8, <4 x i1> undef, <4 x i1> undef)
+  %v8i1 = call <8 x i1> @llvm.masked.load.v8i1.p0v8i1(<8 x i1> *undef, i32 8, <8 x i1> undef, <8 x i1> undef)
+  %v16i1 = call <16 x i1> @llvm.masked.load.v16i1.p0v16i1(<16 x i1> *undef, i32 8, <16 x i1> undef, <16 x i1> undef)
+  %v32i1 = call <32 x i1> @llvm.masked.load.v32i1.p0v32i1(<32 x i1> *undef, i32 8, <32 x i1> undef, <32 x i1> undef)
+  %v64i1 = call <64 x i1> @llvm.masked.load.v64i1.p0v64i1(<64 x i1> *undef, i32 8, <64 x i1> undef, <64 x i1> undef)
+  %v128i1 = call <128 x i1> @llvm.masked.load.v128i1.p0v128i1(<128 x i1> *undef, i32 8, <128 x i1> undef, <128 x i1> undef)
+  %v256i1 = call <256 x i1> @llvm.masked.load.v256i1.p0v256i1(<256 x i1> *undef, i32 16, <256 x i1> undef, <256 x i1> undef)
+  %v512i1 = call <512 x i1> @llvm.masked.load.v512i1.p0v512i1(<512 x i1> *undef, i32 32, <512 x i1> undef, <512 x i1> undef)
+  %v1024i1 = call <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1> *undef, i32 64, <1024 x i1> undef, <1024 x i1> undef)
+
+  %v1i8 = call <1 x i8> @llvm.masked.load.v1i8.p0v1i8(<1 x i8> *undef, i32 8, <1 x i1> undef, <1 x i8> undef)
   %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8> *undef, i32 8, <2 x i1> undef, <2 x i8> undef)
   %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8> *undef, i32 8, <4 x i1> undef, <4 x i8> undef)
   %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8> *undef, i32 8, <8 x i1> undef, <8 x i8> undef)
   %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8> *undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+  %v32i8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8> *undef, i32 16, <32 x i1> undef, <32 x i8> undef)
+  %v64i8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8> *undef, i32 32, <64 x i1> undef, <64 x i8> undef)
+  %v128i8 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8> *undef, i32 64, <128 x i1> undef, <128 x i8> undef)
+  %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8> *undef, i32 128, <256 x i1> undef, <256 x i8> undef)
+  %v512i8 = call <512 x i8> @llvm.masked.load.v512i8.p0v512i8(<512 x i8> *undef, i32 256, <512 x i1> undef, <512 x i8> undef)
+  %v1024i8 = call <1024 x i8> @llvm.masked.load.v1024i8.p0v1024i8(<1024 x i8> *undef, i32 512, <1024 x i1> undef, <1024 x i8> undef)
+
+  %v1i16 = call <1 x i16> @llvm.masked.load.v1i16.p0v1i16(<1 x i16> *undef, i32 8, <1 x i1> undef, <1 x i16> undef)
   %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16> *undef, i32 8, <2 x i1> undef, <2 x i16> undef)
   %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16> *undef, i32 8, <4 x i1> undef, <4 x i16> undef)
   %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16> *undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+  %v16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16> *undef, i32 8, <16 x i1> undef, <16 x i16> undef)
+  %v32i16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16> *undef, i32 16, <32 x i1> undef, <32 x i16> undef)
+  %v64i16 = call <64 x i16> @llvm.masked.load.v64i16.p0v64i16(<64 x i16> *undef, i32 32, <64 x i1> undef, <64 x i16> undef)
+  %v128i16 = call <128 x i16> @llvm.masked.load.v128i16.p0v128i16(<128 x i16> *undef, i32 64, <128 x i1> undef, <128 x i16> undef)
+  %v256i16 = call <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16> *undef, i32 128, <256 x i1> undef, <256 x i16> undef)
+  %v512i16 = call <512 x i16> @llvm.masked.load.v512i16.p0v512i16(<512 x i16> *undef, i32 256, <512 x i1> undef, <512 x i16> undef)
+
+  %v1i32 = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32> *undef, i32 8, <1 x i1> undef, <1 x i32> undef)
   %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32> *undef, i32 8, <2 x i1> undef, <2 x i32> undef)
   %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32> *undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+  %v8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32> *undef, i32 8, <8 x i1> undef, <8 x i32> undef)
+  %v16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32> *undef, i32 8, <16 x i1> undef, <16 x i32> undef)
+  %v32i32 = call <32 x i32> @llvm.masked.load.v32i32.p0v32i32(<32 x i32> *undef, i32 16, <32 x i1> undef, <32 x i32> undef)
+  %v64i32 = call <64 x i32> @llvm.masked.load.v64i32.p0v64i32(<64 x i32> *undef, i32 32, <64 x i1> undef, <64 x i32> undef)
+  %v128i32 = call <128 x i32> @llvm.masked.load.v128i32.p0v128i32(<128 x i32> *undef, i32 64, <128 x i1> undef, <128 x i32> undef)
+  %v256i32 = call <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32> *undef, i32 128, <256 x i1> undef, <256 x i32> undef)
+  %v512i32 = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32> *undef, i32 256, <512 x i1> undef, <512 x i32> undef)
+  %v1024i32 = call <1024 x i32> @llvm.masked.load.v1024i32.p0v1024i32(<1024 x i32> *undef, i32 512, <1024 x i1> undef, <1024 x i32> undef)
+  %v2048i32 = call <2048 x i32> @llvm.masked.load.v2048i32.p0v2048i32(<2048 x i32> *undef, i32 1024, <2048 x i1> undef, <2048 x i32> undef)
+
+  %v1i64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64> *undef, i32 4, <1 x i1> undef, <1 x i64> undef)
   %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64> *undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+  %v8i64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64> *undef, i32 16, <8 x i1> undef, <8 x i64> undef)
+  %v16i64 = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64> *undef, i32 32, <16 x i1> undef, <16 x i64> undef)
+  %v32i64 = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64> *undef, i32 64, <32 x i1> undef, <32 x i64> undef)
+  %v64i64 = call <64 x i64> @llvm.masked.load.v64i64.p0v64i64(<64 x i64> *undef, i32 128, <64 x i1> undef, <64 x i64> undef)
+  %v128i64 = call <128 x i64> @llvm.masked.load.v128i64.p0v128i64(<128 x i64> *undef, i32 256, <128 x i1> undef, <128 x i64> undef)
+  %v256i64 = call <256 x i64> @llvm.masked.load.v256i64.p0v256i64(<256 x i64> *undef, i32 512, <256 x i1> undef, <256 x i64> undef)
-  ; Legal fixed-width floating point types
+  %v1f16 = call <1 x half> @llvm.masked.load.v1f16.p0v1f16(<1 x half> *undef, i32 8, <1 x i1> undef, <1 x half> undef)
   %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half> *undef, i32 8, <2 x i1> undef, <2 x half> undef)
   %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half> *undef, i32 8, <4 x i1> undef, <4 x half> undef)
   %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half> *undef, i32 8, <8 x i1> undef, <8 x half> undef)
+  %v16f16 = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half> *undef, i32 8, <16 x i1> undef, <16 x half> undef)
+  %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half> *undef, i32 8, <32 x i1> undef, <32 x half> undef)
+  %v64f16 = call <64 x half> @llvm.masked.load.v64f16.p0v64f16(<64 x half> *undef, i32 8, <64 x i1> undef, <64 x half> undef)
+  %v128f16 = call <128 x half> @llvm.masked.load.v128f16.p0v128f16(<128 x half> *undef, i32 8, <128 x i1> undef, <128 x half> undef)
+  %v256f16 = call <256 x half> @llvm.masked.load.v256f16.p0v256f16(<256 x half> *undef, i32 8, <256 x i1> undef, <256 x half> undef)
+  %v512f16 = call <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half> *undef, i32 8, <512 x i1> undef, <512 x half> undef)
+
+  %v1f32 = call <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float> *undef, i32 8, <1 x i1> undef, <1 x float> undef)
   %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float> *undef, i32 8, <2 x i1> undef, <2 x float> undef)
   %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float> *undef, i32 8, <4 x i1> undef, <4 x float> undef)
-  %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double> *undef, i32 8, <2 x i1> undef, <2 x double> undef)
+  %v8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float> *undef, i32 8, <8 x i1> undef, <8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float> *undef, i32 8, <16 x i1> undef, <16 x float> undef)
+  %v32f32 = call <32 x float> @llvm.masked.load.v32f32.p0v32f32(<32 x float> *undef, i32 8, <32 x i1> undef, <32 x float> undef)
+  %v64f32 = call <64 x float> @llvm.masked.load.v64f32.p0v64f32(<64 x float> *undef, i32 8, <64 x i1> undef, <64 x float> undef)
+  %v128f32 = call <128 x float> @llvm.masked.load.v128f32.p0v128f32(<128 x float> *undef, i32 8, <128 x i1> undef, <128 x float> undef)
+  %v256f32 = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float> *undef, i32 8, <256 x i1> undef, <256 x float> undef)
+  %v512f32 = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float> *undef, i32 8, <512 x i1> undef, <512 x float> undef)
+  %v1024f32 = call <1024 x float> @llvm.masked.load.v1024f32.p0v1024f32(<1024 x float> *undef, i32 8, <1024 x i1> undef, <1024 x float> undef)
+  %v2048f32 = call <2048 x float> @llvm.masked.load.v2048f32.p0v2048f32(<2048 x float> *undef, i32 8, <2048 x i1> undef, <2048 x float> undef)
-  ; A couple of examples of illegal fixed-width types
-  %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64> *undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-  %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half> *undef, i32 8, <32 x i1> undef, <32 x half> undef)
+  %v1f64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double> *undef, i32 2, <1 x i1> undef, <1 x double> undef)
+  %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double> *undef, i32 4, <2 x i1> undef, <2 x double> undef)
+  %v4f64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double> *undef, i32 8, <4 x i1> undef, <4 x double> undef)
+  %v8f64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double> *undef, i32 16, <8 x i1> undef, <8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double> *undef, i32 32, <16 x i1> undef, <16 x double> undef)
+  %v32f64 = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double> *undef, i32 64, <32 x i1> undef, <32 x double> undef)
+  %v64f64 = call <64 x double> @llvm.masked.load.v64f64.p0v64f64(<64 x double> *undef, i32 128, <64 x i1> undef, <64 x double> undef)
   ret void
 }
@@ -54,92 +182,372 @@
 define void @scalable() {
 ; CHECK-LABEL: 'scalable'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i1 = call <vscale x 1 x i1> @llvm.masked.load.nxv1i1.p0nxv1i1(<vscale x 1 x i1>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1 = call <vscale x 2 x i1> @llvm.masked.load.nxv2i1.p0nxv2i1(<vscale x 2 x i1>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0nxv4i1(<vscale x 4 x i1>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1 = call <vscale x 8 x i1> @llvm.masked.load.nxv8i1.p0nxv8i1(<vscale x 8 x i1>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1 = call <vscale x 16 x i1> @llvm.masked.load.nxv16i1.p0nxv16i1(<vscale x 16 x i1>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1 = call <vscale x 32 x i1> @llvm.masked.load.nxv32i1.p0nxv32i1(<vscale x 32 x i1>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i1 = call <vscale x 64 x i1> @llvm.masked.load.nxv64i1.p0nxv64i1(<vscale x 64 x i1>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv128i1 = call <vscale x 128 x i1> @llvm.masked.load.nxv128i1.p0nxv128i1(<vscale x 128 x i1>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv256i1 = call <vscale x 256 x i1> @llvm.masked.load.nxv256i1.p0nxv256i1(<vscale x 256 x i1>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv512i1 = call <vscale x 512 x i1> @llvm.masked.load.nxv512i1.p0nxv512i1(<vscale x 512 x i1>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv1024i1 = call <vscale x 1024 x i1> @llvm.masked.load.nxv1024i1.p0nxv1024i1(<vscale x 1024 x i1>* undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i1> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i8 = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8.p0nxv1i8(<vscale x 1 x i8>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0nxv32i8(<vscale x 32 x i8>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i8 = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0nxv64i8(<vscale x 64 x i8>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv128i8 = call <vscale x 128 x i8> @llvm.masked.load.nxv128i8.p0nxv128i8(<vscale x 128 x i8>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv256i8 = call <vscale x 256 x i8> @llvm.masked.load.nxv256i8.p0nxv256i8(<vscale x 256 x i8>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv512i8 = call <vscale x 512 x i8> @llvm.masked.load.nxv512i8.p0nxv512i8(<vscale x 512 x i8>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv1024i8 = call <vscale x 1024 x i8> @llvm.masked.load.nxv1024i8.p0nxv1024i8(<vscale x 1024 x i8>* undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i8> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i16 = call <vscale x 1 x i16> @llvm.masked.load.nxv1i16.p0nxv1i16(<vscale x 1 x i16>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0nxv16i16(<vscale x 16 x i16>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i16 = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16.p0nxv32i16(<vscale x 32 x i16>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i16 = call <vscale x 64 x i16> @llvm.masked.load.nxv64i16.p0nxv64i16(<vscale x 64 x i16>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv128i16 = call <vscale x 128 x i16> @llvm.masked.load.nxv128i16.p0nxv128i16(<vscale x 128 x i16>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv256i16 = call <vscale x 256 x i16> @llvm.masked.load.nxv256i16.p0nxv256i16(<vscale x 256 x i16>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv512i16 = call <vscale x 512 x i16> @llvm.masked.load.nxv512i16.p0nxv512i16(<vscale x 512 x i16>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i16> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i32 = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0nxv1i32(<vscale x 1 x i32>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0nxv8i32(<vscale x 8 x i32>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = call <vscale x 16 x i32> @llvm.masked.load.nxv16i32.p0nxv16i32(<vscale x 16 x i32>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i32 = call <vscale x 32 x i32> @llvm.masked.load.nxv32i32.p0nxv32i32(<vscale x 32 x i32>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv64i32 = call <vscale x 64 x i32> @llvm.masked.load.nxv64i32.p0nxv64i32(<vscale x 64 x i32>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv128i32 = call <vscale x 128 x i32> @llvm.masked.load.nxv128i32.p0nxv128i32(<vscale x 128 x i32>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv256i32 = call <vscale x 256 x i32> @llvm.masked.load.nxv256i32.p0nxv256i32(<vscale x 256 x i32>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %nxv512i32 = call <vscale x 512 x i32> @llvm.masked.load.nxv512i32.p0nxv512i32(<vscale x 512 x i32>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %nxv1024i32 = call <vscale x 1024 x i32> @llvm.masked.load.nxv1024i32.p0nxv1024i32(<vscale x 1024 x i32>* undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %nxv2048i32 = call <vscale x 2048 x i32> @llvm.masked.load.nxv2048i32.p0nxv2048i32(<vscale x 2048 x i32>* undef, i32 8, <vscale x 2048 x i1> undef, <vscale x 2048 x i32> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0nxv8i64(<vscale x 8 x i64>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i64 = call <vscale x 16 x i64> @llvm.masked.load.nxv16i64.p0nxv16i64(<vscale x 16 x i64>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv32i64 = call <vscale x 32 x i64> @llvm.masked.load.nxv32i64.p0nxv32i64(<vscale x 32 x i64>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv64i64 = call <vscale x 64 x i64> @llvm.masked.load.nxv64i64.p0nxv64i64(<vscale x 64 x i64>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv128i64 = call <vscale x 128 x i64> @llvm.masked.load.nxv128i64.p0nxv128i64(<vscale x 128 x i64>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %nxv256i64 = call <vscale x 256 x i64> @llvm.masked.load.nxv256i64.p0nxv256i64(<vscale x 256 x i64>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i64> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = call <vscale x 1 x half> @llvm.masked.load.nxv1f16.p0nxv1f16(<vscale x 1 x half>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = call <vscale x 16 x half> @llvm.masked.load.nxv16f16.p0nxv16f16(<vscale x 16 x half>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64f16 = call <vscale x 64 x half> @llvm.masked.load.nxv64f16.p0nxv64f16(<vscale x 64 x half>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv128f16 = call <vscale x 128 x half> @llvm.masked.load.nxv128f16.p0nxv128f16(<vscale x 128 x half>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv256f16 = call <vscale x 256 x half> @llvm.masked.load.nxv256f16.p0nxv256f16(<vscale x 256 x half>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv512f16 = call <vscale x 512 x half> @llvm.masked.load.nxv512f16.p0nxv512f16(<vscale x 512 x half>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x half> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1f32 = call <vscale x 1 x float> @llvm.masked.load.nxv1f32.p0nxv1f32(<vscale x 1 x float>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32 = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0nxv8f32(<vscale x 8 x float>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32 = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0nxv16f32(<vscale x 16 x float>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32f32 = call <vscale x 32 x float> @llvm.masked.load.nxv32f32.p0nxv32f32(<vscale x 32 x float>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv64f32 = call <vscale x 64 x float> @llvm.masked.load.nxv64f32.p0nxv64f32(<vscale x 64 x float>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv128f32 = call <vscale x 128 x float> @llvm.masked.load.nxv128f32.p0nxv128f32(<vscale x 128 x float>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv256f32 = call <vscale x 256 x float> @llvm.masked.load.nxv256f32.p0nxv256f32(<vscale x 256 x float>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %nxv512f32 = call <vscale x 512 x float> @llvm.masked.load.nxv512f32.p0nxv512f32(<vscale x 512 x float>* undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %nxv1024f32 = call <vscale x 1024 x float> @llvm.masked.load.nxv1024f32.p0nxv1024f32(<vscale x 1024 x float>* undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %nxv2048f32 = call <vscale x 2048 x float> @llvm.masked.load.nxv2048f32.p0nxv2048f32(<vscale x 2048 x float>* undef, i32 8, <vscale x 2048 x i1> undef, <vscale x 2048 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64 = call <vscale x 1 x double> @llvm.masked.load.nxv1f64.p0nxv1f64(<vscale x 1 x double>* undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64 = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = call <vscale x 8 x double> @llvm.masked.load.nxv8f64.p0nxv8f64(<vscale x 8 x double>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16f64 = call <vscale x 16 x double> @llvm.masked.load.nxv16f64.p0nxv16f64(<vscale x 16 x double>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv32f64 = call <vscale x 32 x double> @llvm.masked.load.nxv32f64.p0nxv32f64(<vscale x 32 x double>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %nxv64f64 = call <vscale x 64 x double> @llvm.masked.load.nxv64f64.p0nxv64f64(<vscale x 64 x double>* undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %nxv128f64 = call <vscale x 128 x double> @llvm.masked.load.nxv128f64.p0nxv128f64(<vscale x 128 x double>* undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %nxv256f64 = call <vscale x 256 x double> @llvm.masked.load.nxv256f64.p0nxv256f64(<vscale x 256 x double>* undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
-  ; Legal scalable integer types
+  ; integer types
+  %nxv1i1 = call <vscale x 1 x i1> @llvm.masked.load.nxv1i1.p0nxv1i1(<vscale x 1 x i1> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i1> undef)
+  %nxv2i1 = call <vscale x 2 x i1> @llvm.masked.load.nxv2i1.p0nxv2i1(<vscale x 2 x i1> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i1> undef)
+  %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0nxv4i1(<vscale x 4 x i1> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
+  %nxv8i1 = call <vscale x 8 x i1> @llvm.masked.load.nxv8i1.p0nxv8i1(<vscale x 8 x i1> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i1> undef)
+  %nxv16i1 = call <vscale x 16 x i1> @llvm.masked.load.nxv16i1.p0nxv16i1(<vscale x 16 x i1> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i1> undef)
+  %nxv32i1 = call <vscale x 32 x i1> @llvm.masked.load.nxv32i1.p0nxv32i1(<vscale x 32 x i1> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef)
+  %nxv64i1 = call <vscale x 64 x i1> @llvm.masked.load.nxv64i1.p0nxv64i1(<vscale x 64 x i1> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i1> undef)
+  %nxv128i1 = call <vscale x 128 x i1> @llvm.masked.load.nxv128i1.p0nxv128i1(<vscale x 128 x i1> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i1> undef)
+  %nxv256i1 = call <vscale x 256 x i1> @llvm.masked.load.nxv256i1.p0nxv256i1(<vscale x 256 x i1> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i1> undef)
+  %nxv512i1 = call <vscale x 512 x i1> @llvm.masked.load.nxv512i1.p0nxv512i1(<vscale x 512 x i1> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i1> undef)
+  %nxv1024i1 = call <vscale x 1024 x i1> @llvm.masked.load.nxv1024i1.p0nxv1024i1(<vscale x 1024 x i1> *undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i1> undef)
+
+  %nxv1i8 = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8.p0nxv1i8(<vscale x 1 x i8> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef)
   %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
   %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
   %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
   %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+  %nxv32i8 = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0nxv32i8(<vscale x 32 x i8> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef)
+  %nxv64i8 = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0nxv64i8(<vscale x 64 x i8> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef)
+  %nxv128i8 = call <vscale x 128 x i8> @llvm.masked.load.nxv128i8.p0nxv128i8(<vscale x 128 x i8> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i8> undef)
+  %nxv256i8 = call <vscale x 256 x i8> @llvm.masked.load.nxv256i8.p0nxv256i8(<vscale x 256 x i8> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i8> undef)
+  %nxv512i8 = call <vscale x 512 x i8> @llvm.masked.load.nxv512i8.p0nxv512i8(<vscale x 512 x i8> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i8> undef)
+  %nxv1024i8 = call <vscale x 1024 x i8> @llvm.masked.load.nxv1024i8.p0nxv1024i8(<vscale x 1024 x i8> *undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i8> undef)
+
+  %nxv1i16 = call <vscale x 1 x i16> @llvm.masked.load.nxv1i16.p0nxv1i16(<vscale x 1 x i16> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef)
   %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
   %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
   %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+  %nxv16i16 = call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0nxv16i16(<vscale x 16 x i16> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+  %nxv32i16 = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16.p0nxv32i16(<vscale x 32 x i16> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef)
+  %nxv64i16 = call <vscale x 64 x i16> @llvm.masked.load.nxv64i16.p0nxv64i16(<vscale x 64 x i16> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i16> undef)
+  %nxv128i16 = call <vscale x 128 x i16> @llvm.masked.load.nxv128i16.p0nxv128i16(<vscale x 128 x i16> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i16> undef)
+  %nxv256i16 = call <vscale x 256 x i16> @llvm.masked.load.nxv256i16.p0nxv256i16(<vscale x 256 x i16> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i16> undef)
+  %nxv512i16 = call <vscale x 512 x i16> @llvm.masked.load.nxv512i16.p0nxv512i16(<vscale x 512 x i16> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i16> undef)
+
+  %nxv1i32 = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0nxv1i32(<vscale x 1 x i32> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef)
   %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
   %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+  %nxv8i32 = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0nxv8i32(<vscale x 8 x i32> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef)
+  %nxv16i32 = call <vscale x 16 x i32> @llvm.masked.load.nxv16i32.p0nxv16i32(<vscale x 16 x i32> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef)
+  %nxv32i32 = call <vscale x 32 x i32> @llvm.masked.load.nxv32i32.p0nxv32i32(<vscale x 32 x i32> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i32> undef)
+  %nxv64i32 = call <vscale x 64 x i32> @llvm.masked.load.nxv64i32.p0nxv64i32(<vscale x 64 x i32> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i32> undef)
+  %nxv128i32 = call <vscale x 128 x i32> @llvm.masked.load.nxv128i32.p0nxv128i32(<vscale x 128 x i32> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i32> undef)
+  %nxv256i32 = call <vscale x 256 x i32> @llvm.masked.load.nxv256i32.p0nxv256i32(<vscale x 256 x i32> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i32> undef)
+  %nxv512i32 = call <vscale x 512 x i32> @llvm.masked.load.nxv512i32.p0nxv512i32(<vscale x 512 x i32> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x i32> undef)
+  %nxv1024i32 = call <vscale x 1024 x i32> @llvm.masked.load.nxv1024i32.p0nxv1024i32(<vscale x 1024 x i32> *undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x i32> undef)
+  %nxv2048i32 = call <vscale x 2048 x i32> @llvm.masked.load.nxv2048i32.p0nxv2048i32(<vscale x 2048 x i32> *undef, i32 8, <vscale x 2048 x i1> undef, <vscale x 2048 x i32> undef)
+
+  %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
   %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+  %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+  %nxv8i64 = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0nxv8i64(<vscale x 8 x i64> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+  %nxv16i64 = call <vscale x 16 x i64> @llvm.masked.load.nxv16i64.p0nxv16i64(<vscale x 16 x i64> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+  %nxv32i64 = call <vscale x 32 x i64> @llvm.masked.load.nxv32i64.p0nxv32i64(<vscale x 32 x i64> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x i64> undef)
+  %nxv64i64 = call <vscale x 64 x i64> @llvm.masked.load.nxv64i64.p0nxv64i64(<vscale x 64 x i64> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x i64> undef)
+  %nxv128i64 = call <vscale x 128 x i64> @llvm.masked.load.nxv128i64.p0nxv128i64(<vscale x 128 x i64> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x i64> undef)
+  %nxv256i64 = call <vscale x 256 x i64> @llvm.masked.load.nxv256i64.p0nxv256i64(<vscale x 256 x i64> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x i64> undef)
-  ; Legal scalable floating point types
+  ; floating-point types
+  %nxv1f16 = call <vscale x 1 x half> @llvm.masked.load.nxv1f16.p0nxv1f16(<vscale x 1 x half> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x half> undef)
   %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
   %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
   %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
+  %nxv16f16 = call <vscale x 16 x half> @llvm.masked.load.nxv16f16.p0nxv16f16(<vscale x 16 x half> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x half> undef)
+  %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+  %nxv64f16 = call <vscale x 64 x half> @llvm.masked.load.nxv64f16.p0nxv64f16(<vscale x 64 x half> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x half> undef)
+  %nxv128f16 = call <vscale x 128 x half> @llvm.masked.load.nxv128f16.p0nxv128f16(<vscale x 128 x half> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x half> undef)
+  %nxv256f16 = call <vscale x 256 x half> @llvm.masked.load.nxv256f16.p0nxv256f16(<vscale x 256 x half> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x half> undef)
+  %nxv512f16 = call <vscale x 512 x half> @llvm.masked.load.nxv512f16.p0nxv512f16(<vscale x 512 x half> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x half> undef)
+
+  %nxv1f32 = call <vscale x 1 x float> @llvm.masked.load.nxv1f32.p0nxv1f32(<vscale x 1 x float> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x float> undef)
   %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
   %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
-  %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+  %nxv8f32 = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0nxv8f32(<vscale x 8 x float> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+  %nxv16f32 = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0nxv16f32(<vscale x 16 x float> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x float> undef)
+  %nxv32f32 = call <vscale x 32 x float> @llvm.masked.load.nxv32f32.p0nxv32f32(<vscale x 32 x float> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x float> undef)
+  %nxv64f32 = call <vscale x 64 x float> @llvm.masked.load.nxv64f32.p0nxv64f32(<vscale x 64 x float> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x float> undef)
+  %nxv128f32 = call <vscale x 128 x float> @llvm.masked.load.nxv128f32.p0nxv128f32(<vscale x 128 x float> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x float> undef)
+  %nxv256f32 = call <vscale x 256 x float> @llvm.masked.load.nxv256f32.p0nxv256f32(<vscale x 256 x float> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x float> undef)
+  %nxv512f32 = call <vscale x 512 x float> @llvm.masked.load.nxv512f32.p0nxv512f32(<vscale x 512 x float> *undef, i32 8, <vscale x 512 x i1> undef, <vscale x 512 x float> undef)
+  %nxv1024f32 = call <vscale x 1024 x float> @llvm.masked.load.nxv1024f32.p0nxv1024f32(<vscale x 1024 x float> *undef, i32 8, <vscale x 1024 x i1> undef, <vscale x 1024 x float> undef)
+  %nxv2048f32 = call <vscale x 2048 x float> @llvm.masked.load.nxv2048f32.p0nxv2048f32(<vscale x 2048 x float> *undef, i32 8, <vscale x 2048 x i1> undef, <vscale x 2048 x float> undef)
-  ; A couple of examples of illegal scalable types
-  %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
-  %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-  %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+  %nxv1f64 = call <vscale x 1 x double> @llvm.masked.load.nxv1f64.p0nxv1f64(<vscale x 1 x double> *undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef)
+  %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+  %nxv4f64 = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+  %nxv8f64 = call <vscale x 8 x double> @llvm.masked.load.nxv8f64.p0nxv8f64(<vscale x 8 x double> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef)
+  %nxv16f64 = call <vscale x 16 x double> @llvm.masked.load.nxv16f64.p0nxv16f64(<vscale x 16 x double> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x double> undef)
+  %nxv32f64 = call <vscale x 32 x double> @llvm.masked.load.nxv32f64.p0nxv32f64(<vscale x 32 x double> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x double> undef)
+  %nxv64f64 = call <vscale x 64 x double> @llvm.masked.load.nxv64f64.p0nxv64f64(<vscale x 64 x double> *undef, i32 8, <vscale x 64 x i1> undef, <vscale x 64 x double> undef)
+  %nxv128f64 = call <vscale x 128 x double> @llvm.masked.load.nxv128f64.p0nxv128f64(<vscale x 128 x double> *undef, i32 8, <vscale x 128 x i1> undef, <vscale x 128 x double> undef)
+  %nxv256f64 = call <vscale x 256 x double> @llvm.masked.load.nxv256f64.p0nxv256f64(<vscale x 256 x double> *undef, i32 8, <vscale x 256 x i1> undef, <vscale x 256 x double> undef)
   ret void
 }
+declare <1 x i1> @llvm.masked.load.v1i1.p0v1i1(<1 x i1>*, i32, <1 x i1>, <1 x i1>)
+declare <2 x i1> @llvm.masked.load.v2i1.p0v2i1(<2 x i1>*, i32, <2 x i1>, <2 x i1>)
+declare <4 x i1> @llvm.masked.load.v4i1.p0v4i1(<4 x i1>*, i32, <4 x i1>, <4 x i1>)
+declare <8 x i1> @llvm.masked.load.v8i1.p0v8i1(<8 x i1>*, i32, <8 x i1>, <8 x i1>)
+declare <16 x i1> @llvm.masked.load.v16i1.p0v16i1(<16 x i1>*, i32, <16 x i1>, <16 x i1>)
+declare <32 x i1> @llvm.masked.load.v32i1.p0v32i1(<32 x i1>*, i32, <32 x i1>, <32 x i1>)
+declare <64 x i1> @llvm.masked.load.v64i1.p0v64i1(<64 x i1>*, i32, <64 x i1>, <64 x i1>)
+declare <128 x i1> @llvm.masked.load.v128i1.p0v128i1(<128 x i1>*, i32, <128 x i1>, <128 x i1>)
+declare <256 x i1> @llvm.masked.load.v256i1.p0v256i1(<256 x i1>*, i32, <256 x i1>, <256 x i1>)
+declare <512 x i1> @llvm.masked.load.v512i1.p0v512i1(<512 x i1>*, i32, <512 x i1>, <512 x i1>)
+declare <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1>*, i32, <1024 x i1>, <1024 x i1>)
+
+declare <1 x i8> @llvm.masked.load.v1i8.p0v1i8(<1 x i8>*, i32, <1 x i1>, <1 x i8>)
 declare <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>*, i32, <2 x i1>, <2 x i8>)
 declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
 declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
+declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
+declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32, <128 x i1>, <128 x i8>)
+declare <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8>*, i32, <256 x i1>, <256 x i8>)
+declare <512 x i8> @llvm.masked.load.v512i8.p0v512i8(<512 x i8>*, i32, <512 x i1>, <512 x i8>)
+declare <1024 x i8> @llvm.masked.load.v1024i8.p0v1024i8(<1024 x i8>*, i32, <1024 x i1>, <1024 x i8>)
+
+declare <1 x i16> @llvm.masked.load.v1i16.p0v1i16(<1 x i16>*, i32, <1 x i1>, <1 x i16>)
 declare <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>*, i32, <2 x i1>, <2 x i16>)
 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
+declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
+declare <64 x i16> @llvm.masked.load.v64i16.p0v64i16(<64 x i16>*, i32, <64 x i1>, <64 x i16>)
+declare <128 x i16> @llvm.masked.load.v128i16.p0v128i16(<128 x i16>*, i32, <128 x i1>, <128 x i16>)
+declare <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16>*, i32, <256 x i1>, <256 x i16>)
+declare <512 x i16> @llvm.masked.load.v512i16.p0v512i16(<512 x i16>*, i32, <512 x i1>, <512 x i16>)
+
+declare <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>*, i32, <1 x i1>, <1 x i32>)
 declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
+declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <32 x i32> @llvm.masked.load.v32i32.p0v32i32(<32 x i32>*, i32, <32 x i1>, <32 x i32>)
+declare <64 x i32> @llvm.masked.load.v64i32.p0v64i32(<64 x i32>*, i32, <64 x i1>, <64 x i32>)
+declare <128 x i32> @llvm.masked.load.v128i32.p0v128i32(<128 x i32>*, i32, <128 x i1>, <128 x i32>)
+declare <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>*, i32, <256 x i1>, <256 x i32>)
+declare <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>*, i32, <512 x i1>, <512 x i32>)
+declare <1024 x i32> @llvm.masked.load.v1024i32.p0v1024i32(<1024 x i32>*, i32, <1024 x i1>, <1024 x i32>)
+declare <2048 x i32> @llvm.masked.load.v2048i32.p0v2048i32(<2048 x i32>*, i32, <2048 x i1>, <2048 x i32>)
+
+declare <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>*, i32, <1 x i1>, <1 x i64>)
 declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
 declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
+declare <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)
+declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>*, i32, <16 x i1>, <16 x i64>)
+declare <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>*, i32, <32 x i1>, <32 x i64>)
+declare <64 x i64> @llvm.masked.load.v64i64.p0v64i64(<64 x i64>*, i32, <64 x i1>, <64 x i64>)
+declare <128 x i64> @llvm.masked.load.v128i64.p0v128i64(<128 x i64>*, i32, <128 x i1>, <128 x i64>)
+declare <256 x i64> @llvm.masked.load.v256i64.p0v256i64(<256 x i64>*, i32, <256 x i1>, <256 x i64>)
+
+declare <1 x half> @llvm.masked.load.v1f16.p0v1f16(<1 x half>*, i32, <1 x i1>, <1 x half>)
 declare <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>*, i32, <2 x i1>, <2 x half>)
 declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>)
 declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
+declare <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>)
 declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>)
+declare <64 x half> @llvm.masked.load.v64f16.p0v64f16(<64 x half>*, i32, <64 x i1>, <64 x half>)
+declare <128 x half> @llvm.masked.load.v128f16.p0v128f16(<128 x half>*, i32, <128 x i1>, <128 x half>)
+declare <256 x half> @llvm.masked.load.v256f16.p0v256f16(<256 x half>*, i32, <256 x i1>, <256 x half>)
+declare <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half>*, i32, <512 x i1>, <512 x half>)
+
+declare <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float>*, i32, <1 x i1>, <1 x float>)
 declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
+declare <32 x float> @llvm.masked.load.v32f32.p0v32f32(<32 x float>*, i32, <32 x i1>, <32 x float>)
+declare <64 x float> @llvm.masked.load.v64f32.p0v64f32(<64 x float>*, i32, <64 x i1>, <64 x float>)
+declare <128 x float> @llvm.masked.load.v128f32.p0v128f32(<128 x float>*, i32, <128 x i1>, <128 x float>)
+declare <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>*, i32, <256 x i1>, <256 x float>)
+declare <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>*, i32, <512 x i1>, <512 x float>)
+declare <1024 x float> @llvm.masked.load.v1024f32.p0v1024f32(<1024 x float>*, i32, <1024 x i1>, <1024 x float>)
+declare <2048 x float> @llvm.masked.load.v2048f32.p0v2048f32(<2048 x float>*, i32, <2048 x i1>, <2048 x float>)
+
+declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>*, i32, <16 x i1>, <16 x double>)
+declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>*, i32, <32 x i1>, <32 x double>)
+declare <64 x double> @llvm.masked.load.v64f64.p0v64f64(<64 x double>*, i32, <64 x i1>, <64 x double>)
+declare <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double>*, i32, <128 x i1>, <128 x double>)
+declare <256 x double> @llvm.masked.load.v256f64.p0v256f64(<256 x double>*, i32, <256 x i1>, <256 x double>)
+declare <vscale x 1 x i1> @llvm.masked.load.nxv1i1.p0nxv1i1(<vscale x 1 x i1>*, i32, <vscale x 1 x i1>, <vscale x 1 x i1>)
+declare <vscale x 2 x i1> @llvm.masked.load.nxv2i1.p0nxv2i1(<vscale x 2 x i1>*, i32, <vscale x 2 x i1>, <vscale x 2 x i1>)
+declare <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0nxv4i1(<vscale x 4 x i1>*, i32, <vscale x 4 x i1>, <vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.masked.load.nxv8i1.p0nxv8i1(<vscale x 8 x i1>*, i32, <vscale x 8 x i1>, <vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.masked.load.nxv16i1.p0nxv16i1(<vscale x 16 x i1>*, i32, <vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 32 x i1> @llvm.masked.load.nxv32i1.p0nxv32i1(<vscale x 32 x i1>*, i32, <vscale x 32 x i1>, <vscale x 32 x i1>)
+declare <vscale x 64 x i1> @llvm.masked.load.nxv64i1.p0nxv64i1(<vscale x 64 x i1>*, i32, <vscale x 64 x i1>, <vscale x 64 x i1>)
+declare <vscale x 128 x i1> @llvm.masked.load.nxv128i1.p0nxv128i1(<vscale x 128 x i1>*, i32, <vscale x 128 x i1>, <vscale x 128 x i1>)
+declare <vscale x 256 x i1> @llvm.masked.load.nxv256i1.p0nxv256i1(<vscale x 256 x i1>*, i32, <vscale x 256 x i1>, <vscale x 256 x i1>)
+declare <vscale x 512 x i1> @llvm.masked.load.nxv512i1.p0nxv512i1(<vscale x 512 x i1>*, i32, <vscale x 512 x i1>, <vscale x 512 x i1>)
+declare <vscale x 1024 x i1> @llvm.masked.load.nxv1024i1.p0nxv1024i1(<vscale x 1024 x i1>*, i32, <vscale x 1024 x i1>, <vscale x 1024 x i1>)
, ) +declare @llvm.masked.load.nxv1i8.p0nxv1i8(*, i32, , ) declare @llvm.masked.load.nxv2i8.p0nxv2i8(*, i32, , ) declare @llvm.masked.load.nxv4i8.p0nxv4i8(*, i32, , ) declare @llvm.masked.load.nxv8i8.p0nxv8i8(*, i32, , ) declare @llvm.masked.load.nxv16i8.p0nxv16i8(*, i32, , ) +declare @llvm.masked.load.nxv32i8.p0nxv32i8(*, i32, , ) +declare @llvm.masked.load.nxv64i8.p0nxv64i8(*, i32, , ) +declare @llvm.masked.load.nxv128i8.p0nxv128i8(*, i32, , ) +declare @llvm.masked.load.nxv256i8.p0nxv256i8(*, i32, , ) +declare @llvm.masked.load.nxv512i8.p0nxv512i8(*, i32, , ) +declare @llvm.masked.load.nxv1024i8.p0nxv1024i8(*, i32, , ) + +declare @llvm.masked.load.nxv1i16.p0nxv1i16(*, i32, , ) declare @llvm.masked.load.nxv2i16.p0nxv2i16(*, i32, , ) declare @llvm.masked.load.nxv4i16.p0nxv4i16(*, i32, , ) declare @llvm.masked.load.nxv8i16.p0nxv8i16(*, i32, , ) +declare @llvm.masked.load.nxv16i16.p0nxv16i16(*, i32, , ) +declare @llvm.masked.load.nxv32i16.p0nxv32i16(*, i32, , ) +declare @llvm.masked.load.nxv64i16.p0nxv64i16(*, i32, , ) +declare @llvm.masked.load.nxv128i16.p0nxv128i16(*, i32, , ) +declare @llvm.masked.load.nxv256i16.p0nxv256i16(*, i32, , ) +declare @llvm.masked.load.nxv512i16.p0nxv512i16(*, i32, , ) + +declare @llvm.masked.load.nxv1i32.p0nxv1i32(*, i32, , ) declare @llvm.masked.load.nxv2i32.p0nxv2i32(*, i32, , ) declare @llvm.masked.load.nxv4i32.p0nxv4i32(*, i32, , ) +declare @llvm.masked.load.nxv8i32.p0nxv8i32(*, i32, , ) +declare @llvm.masked.load.nxv16i32.p0nxv16i32(*, i32, , ) +declare @llvm.masked.load.nxv32i32.p0nxv32i32(*, i32, , ) +declare @llvm.masked.load.nxv64i32.p0nxv64i32(*, i32, , ) +declare @llvm.masked.load.nxv128i32.p0nxv128i32(*, i32, , ) +declare @llvm.masked.load.nxv256i32.p0nxv256i32(*, i32, , ) +declare @llvm.masked.load.nxv512i32.p0nxv512i32(*, i32, , ) +declare @llvm.masked.load.nxv1024i32.p0nxv1024i32(*, i32, , ) +declare @llvm.masked.load.nxv2048i32.p0nxv2048i32(*, i32, , ) + +declare @llvm.masked.load.nxv1i64.p0nxv1i64(*, i32, , ) declare @llvm.masked.load.nxv2i64.p0nxv2i64(*, i32, , ) declare @llvm.masked.load.nxv4i64.p0nxv4i64(*, i32, , ) -declare @llvm.masked.load.nxv1i64.p0nxv1i64(*, i32, , ) +declare @llvm.masked.load.nxv8i64.p0nxv8i64(*, i32, , ) +declare @llvm.masked.load.nxv16i64.p0nxv16i64(*, i32, , ) +declare @llvm.masked.load.nxv32i64.p0nxv32i64(*, i32, , ) +declare @llvm.masked.load.nxv64i64.p0nxv64i64(*, i32, , ) +declare @llvm.masked.load.nxv128i64.p0nxv128i64(*, i32, , ) +declare @llvm.masked.load.nxv256i64.p0nxv256i64(*, i32, , ) + +declare @llvm.masked.load.nxv1f16.p0nxv1f16(*, i32, , ) declare @llvm.masked.load.nxv2f16.p0nxv2f16(*, i32, , ) declare @llvm.masked.load.nxv4f16.p0nxv4f16(*, i32, , ) declare @llvm.masked.load.nxv8f16.p0nxv8f16(*, i32, , ) +declare @llvm.masked.load.nxv16f16.p0nxv16f16(*, i32, , ) declare @llvm.masked.load.nxv32f16.p0nxv32f16(*, i32, , ) +declare @llvm.masked.load.nxv64f16.p0nxv64f16(*, i32, , ) +declare @llvm.masked.load.nxv128f16.p0nxv128f16(*, i32, , ) +declare @llvm.masked.load.nxv256f16.p0nxv256f16(*, i32, , ) +declare @llvm.masked.load.nxv512f16.p0nxv512f16(*, i32, , ) +declare @llvm.masked.load.nxv1024f16.p0nxv1024f16(*, i32, , ) +declare @llvm.masked.load.nxv2048f16.p0nxv2048f16(*, i32, , ) + +declare @llvm.masked.load.nxv1f32.p0nxv1f32(*, i32, , ) declare @llvm.masked.load.nxv2f32.p0nxv2f32(*, i32, , ) declare @llvm.masked.load.nxv4f32.p0nxv4f32(*, i32, , ) +declare @llvm.masked.load.nxv8f32.p0nxv8f32(*, i32, , ) +declare @llvm.masked.load.nxv16f32.p0nxv16f32(*, i32, , ) +declare 
@llvm.masked.load.nxv32f32.p0nxv32f32(*, i32, , ) +declare @llvm.masked.load.nxv64f32.p0nxv64f32(*, i32, , ) +declare @llvm.masked.load.nxv128f32.p0nxv128f32(*, i32, , ) +declare @llvm.masked.load.nxv256f32.p0nxv256f32(*, i32, , ) +declare @llvm.masked.load.nxv512f32.p0nxv512f32(*, i32, , ) +declare @llvm.masked.load.nxv1024f32.p0nxv1024f32(*, i32, , ) +declare @llvm.masked.load.nxv2048f32.p0nxv2048f32(*, i32, , ) + +declare @llvm.masked.load.nxv1f64.p0nxv1f64(*, i32, , ) declare @llvm.masked.load.nxv2f64.p0nxv2f64(*, i32, , ) +declare @llvm.masked.load.nxv4f64.p0nxv4f64(*, i32, , ) +declare @llvm.masked.load.nxv8f64.p0nxv8f64(*, i32, , ) +declare @llvm.masked.load.nxv16f64.p0nxv16f64(*, i32, , ) +declare @llvm.masked.load.nxv32f64.p0nxv32f64(*, i32, , ) +declare @llvm.masked.load.nxv64f64.p0nxv64f64(*, i32, , ) +declare @llvm.masked.load.nxv128f64.p0nxv128f64(*, i32, , ) +declare @llvm.masked.load.nxv256f64.p0nxv256f64(*, i32, , )