Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -194,8 +194,17 @@ if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); - // The extend of a load is free - if (I && isa(I->getOperand(0))) { + // Extending masked load/Truncating masked stores is expensive because we + // currently don't split them. This means that we'll likely end up + // loading/storing each element individually (hence the high cost). + if (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || + Opcode == Instruction::SExt) + if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) + return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor(); + + // The extend of other kinds of load is free + if (CCH == TTI::CastContextHint::Normal || + CCH == TTI::CastContextHint::Masked) { static const TypeConversionCostTblEntry LoadConversionTbl[] = { {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, Index: llvm/test/Analysis/CostModel/ARM/cast.ll =================================================================== --- llvm/test/Analysis/CostModel/ARM/cast.ll +++ llvm/test/Analysis/CostModel/ARM/cast.ll @@ -1930,3 +1930,592 @@ %h = bitcast i16 undef to half ret i32 undef } + +define void @masked_load_to_ext() { +; CHECK-NEON-LABEL: 'masked_load_to_ext' +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 1, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_to_v4i16_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_to_v4i16_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_to_v4i32_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_to_v4i32_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_zext = zext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_sext = sext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_zext = zext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_sext = sext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_sext = sext <4 x i8> %maskedloadv4i8 to <4 
x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: 
%v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-LABEL: 'masked_load_to_ext' +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 1, <4 x i1> undef, <4 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; 
CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_to_v4i16_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_to_v4i16_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8_to_v4i32_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8_to_v4i32_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_zext = zext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: 
%v8i32_to_v8i64_sext = sext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_zext = zext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_sext = sext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; 
CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; 
CHECK-V8M-MAIN-LABEL: 'masked_load_to_ext' +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 1, <4 x i1> undef, <4 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%v4i8_to_v4i16_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_to_v4i16_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_to_v4i32_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_to_v4i32_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_zext = zext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_sext = sext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_sext = sext <16 x i16> 
%maskedloadv16i16 to <16 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_zext = zext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_sext = sext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; 
CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-V8M-BASE-LABEL: 'masked_load_to_ext' +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 1, <4 x i1> undef, <4 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_to_v4i16_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_to_v4i16_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_to_v4i32_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_to_v4i32_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_zext = zext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_sext = sext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_zext = zext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_sext = sext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %v4i8_to_v4i64_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext 
<16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-V8R-LABEL: 'masked_load_to_ext' +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 1, <4 x i1> undef, <4 x i8> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %maskedloadv8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_to_v4i16_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_to_v4i16_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_to_v4i32_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_to_v4i32_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_zext = 
zext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32_to_v8i64_sext = sext <8 x i32> %maskedloadv8i32 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i32_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i16_to_v16i64_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_zext = zext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i64_sext = sext <16 x i32> %maskedloadv16i32 to <16 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_to_v4i64_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: 
Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 
to <16 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %maskedloadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 1, <4 x i1> undef, <4 x i8> undef) + %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) + %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) + + %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) + %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %maskedloadv8i32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) + + %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %maskedloadv16i16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %maskedloadv16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) + + ; extend loads of types <128 bits to types <=128 bits. + %v4i8_to_v4i16_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i16> + %v4i8_to_v4i16_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i16> + + %v4i8_to_v4i32_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i32> + %v4i8_to_v4i32_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i32> + + %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> + %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> + + %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> + %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> + + ; extend loads of types >128 bits to types >128 bits. 
+ %v8i32_to_v8i64_zext = zext <8 x i32> %maskedloadv8i32 to <8 x i64> + %v8i32_to_v8i64_sext = sext <8 x i32> %maskedloadv8i32 to <8 x i64> + + %v16i16_to_v16i32_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i32> + %v16i16_to_v16i32_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i32> + + %v16i16_to_v16i64_zext = zext <16 x i16> %maskedloadv16i16 to <16 x i64> + %v16i16_to_v16i64_sext = sext <16 x i16> %maskedloadv16i16 to <16 x i64> + + %v16i32_to_v16i64_zext = zext <16 x i32> %maskedloadv16i32 to <16 x i64> + %v16i32_to_v16i64_sext = sext <16 x i32> %maskedloadv16i32 to <16 x i64> + + ; extend loads of types <=128 bits to types >128 bits. + %v4i8_to_v4i64_zext = zext <4 x i8> %maskedloadv4i8 to <4 x i64> + %v4i8_to_v4i64_sext = sext <4 x i8> %maskedloadv4i8 to <4 x i64> + + %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> + %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> + + %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> + %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> + + %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> + %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> + %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> + %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> + + %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> + %v8i16_to_v8i32_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i32> + %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> + %v8i16_to_v8i64_sext = sext <8 x i16> %maskedloadv8i16 to <8 x i64> + + %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> + %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> + %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> + %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> + %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> + 
%v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> + + ret void +} + +declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) + +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) + +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>) +declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) + +define void @trunc_to_masked_stores() { +; CHECK-NEON-LABEL: 'trunc_to_masked_stores' +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i8 = trunc <4 x i16> undef to <4 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i16_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_to_v4i8 = trunc <4 x i32> undef to <4 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i32_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_to_v4i8 = trunc <4 x i64> undef to <4 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i64_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v4i16_to_v4i32 = trunc <4 x i32> undef to <4 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i16_to_v4i32, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_to_v4i16 = trunc <4 x i64> undef to <4 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i64_to_v4i16, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_to_v4i32 = trunc <4 x i64> undef to <4 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v4i64_to_v4i32, <4 x i32>* undef, i32 1, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_to_v8i8_trunc = trunc <8 x i16> undef to <8 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i16_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_to_v8i8_trunc = trunc <8 x i32> undef to <8 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i32_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_to_v8i16_trunc = trunc <8 x i32> undef to <8 
x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i32_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i16_trunc = trunc <8 x i64> undef to <8 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i64_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_to_v16i8_trunc = trunc <16 x i16> undef to <16 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i16_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i64_to_v16i8_trunc = trunc <16 x i64> undef to <16 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i64_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> 
undef to <16 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-LABEL: 'trunc_to_masked_stores' +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i8 = trunc <4 x i16> undef to <4 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i16_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_to_v4i8 = trunc <4 x i32> undef to <4 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i32_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_to_v4i8 = trunc <4 x i64> undef to <4 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i64_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-MVE-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32 = trunc <4 x i32> undef to <4 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i16_to_v4i32, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_to_v4i16 = trunc <4 x i64> undef to <4 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i64_to_v4i16, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_to_v4i32 = trunc <4 x i64> undef to <4 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v4i64_to_v4i32, <4 x i32>* undef, i32 1, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_to_v8i8_trunc = trunc <8 x i16> undef to <8 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i16_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_to_v8i8_trunc = trunc <8 x i32> undef to <8 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i32_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v8i32_to_v8i16_trunc = trunc <8 x i32> undef to <8 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i32_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v8i64_to_v8i16_trunc = trunc <8 x i64> undef to <8 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i64_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_to_v16i8_trunc = trunc <16 x i16> undef to <16 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i16_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v16i64_to_v16i8_trunc = trunc <16 x i64> undef to <16 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i64_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: 
%v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-V8M-MAIN-LABEL: 'trunc_to_masked_stores' +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i8 = trunc <4 x i16> undef to <4 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i16_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_to_v4i8 = trunc <4 x i32> undef to <4 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i32_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64_to_v4i8 = trunc <4 x i64> undef to <4 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> 
%v4i64_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32 = trunc <4 x i32> undef to <4 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i16_to_v4i32, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64_to_v4i16 = trunc <4 x i64> undef to <4 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i64_to_v4i16, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64_to_v4i32 = trunc <4 x i64> undef to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v4i64_to_v4i32, <4 x i32>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_to_v8i8_trunc = trunc <8 x i16> undef to <8 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i16_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_to_v8i8_trunc = trunc <8 x i32> undef to <8 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i32_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, 
<8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_to_v8i16_trunc = trunc <8 x i32> undef to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i32_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64_to_v8i16_trunc = trunc <8 x i64> undef to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i64_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_to_v16i8_trunc = trunc <16 x i16> undef to <16 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i16_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64_to_v16i8_trunc = trunc <16 x i64> undef to <16 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i64_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void 
@llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-V8M-BASE-LABEL: 'trunc_to_masked_stores' +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i8 = trunc <4 x i16> undef to <4 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i16_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_to_v4i8 = trunc <4 x i32> undef to <4 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i32_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %v4i64_to_v4i8 = trunc <4 x i64> undef to <4 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i64_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32 = trunc <4 x i32> undef to <4 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i16_to_v4i32, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64_to_v4i16 = trunc <4 x i64> undef to <4 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i64_to_v4i16, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64_to_v4i32 = trunc <4 x i64> undef to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v4i64_to_v4i32, <4 x i32>* undef, i32 1, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_to_v8i8_trunc = trunc <8 x i16> undef to <8 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i16_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_to_v8i8_trunc = trunc <8 x i32> undef to <8 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i32_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_to_v8i16_trunc = trunc <8 x i32> undef to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i32_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64_to_v8i16_trunc = trunc <8 x i64> undef to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i64_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_to_v16i8_trunc = trunc <16 x i16> undef to <16 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i16_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64_to_v16i8_trunc = trunc <16 x i64> undef to <16 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i64_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; 
CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-V8R-LABEL: 'trunc_to_masked_stores' +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i8 = trunc <4 x i16> undef to <4 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i16_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_to_v4i8 = trunc <4 x i32> undef to <4 x i8> +; CHECK-V8R-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i32_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_to_v4i8 = trunc <4 x i64> undef to <4 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i64_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_to_v4i32 = trunc <4 x i32> undef to <4 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i16_to_v4i32, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_to_v4i16 = trunc <4 x i64> undef to <4 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i64_to_v4i16, <4 x i16>* undef, i32 1, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_to_v4i32 = trunc <4 x i64> undef to <4 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v4i64_to_v4i32, <4 x i32>* undef, i32 1, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_to_v8i8_trunc = trunc <8 x i16> undef to <8 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i16_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_to_v8i8_trunc = trunc <8 x i32> undef to <8 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void 
@llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i32_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_to_v8i16_trunc = trunc <8 x i32> undef to <8 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i32_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i16_trunc = trunc <8 x i64> undef to <8 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i64_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_to_v16i8_trunc = trunc <16 x i16> undef to <16 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i16_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i64_to_v16i8_trunc = trunc <16 x i64> undef to <16 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void 
@llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i64_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + + ; trunc to types <128 bits and store. 
+ %v4i16_to_v4i8 = trunc <4 x i16> undef to <4 x i8> + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i16_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) + + %v4i32_to_v4i8 = trunc <4 x i32> undef to <4 x i8> + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i32_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) + + %v4i64_to_v4i8 = trunc <4 x i64> undef to <4 x i8> + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %v4i64_to_v4i8, <4 x i8>* undef, i32 1, <4 x i1> undef) + + %v4i16_to_v4i32 = trunc <4 x i32> undef to <4 x i16> + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i16_to_v4i32, <4 x i16>* undef, i32 1, <4 x i1> undef) + + %v4i64_to_v4i16 = trunc <4 x i64> undef to <4 x i16> + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %v4i64_to_v4i16, <4 x i16>* undef, i32 1, <4 x i1> undef) + + %v4i64_to_v4i32 = trunc <4 x i64> undef to <4 x i32> + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v4i64_to_v4i32, <4 x i32>* undef, i32 1, <4 x i1> undef) + + %v8i16_to_v8i8_trunc = trunc <8 x i16> undef to <8 x i8> + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i16_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) + + %v8i32_to_v8i8_trunc = trunc <8 x i32> undef to <8 x i8> + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i32_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) + + %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) + + %v8i32_to_v8i16_trunc = trunc <8 x i32> undef to <8 x i16> + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i32_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) + + %v8i64_to_v8i16_trunc = trunc <8 x i64> undef to <8 x i16> + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %v8i64_to_v8i16_trunc, <8 x i16>* undef, i32 1, <8 x i1> undef) + + %v16i16_to_v16i8_trunc = trunc <16 x i16> undef to <16 x i8> + call void @llvm.masked.store.v16i8.p0v16i8(<16 
x i8> %v16i16_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) + + %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) + + %v16i64_to_v16i8_trunc = trunc <16 x i64> undef to <16 x i8> + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i64_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) + + ; trunc and store types >128 bits. + %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> + call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) + + %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> + call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) + + %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> + call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) + + %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> + call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) + + ret void +} + +declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32 immarg, <4 x i1>) +declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + +declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) +declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, <8 x i1>) + +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) +declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32 immarg, <16 x i1>) 
+declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32 immarg, <16 x i1>)
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll
@@ -0,0 +1,111 @@
+; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT
+; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=TAILPRED
+; Both invocations vectorize the same function; only the second one passes -prefer-predicate-over-epilog.
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+; Triple loop nest: the innermost body sign-extends nine i8 loads (a 3x3 neighbourhood, row stride 320), combines them in i32 and stores one truncated i8.
+define i32 @tp_reduces_vf(i8* nocapture %0, i32 %1, i8** %input) {
+  ; TP is presumably tail predication, i.e. what -prefer-predicate-over-epilog requests in the second invocation.
+  ; When TP is disabled, this test should vectorize with a VF of 16.
+  ; When TP is enabled, this test should vectorize with a VF of 8.
+  ; NOTE(review): the smaller VF presumably comes from getCastInstrCost (ARMTargetTransformInfo.cpp) charging a high cost for sext/zext/trunc in a masked context when the destination vector exceeds 128 bits - confirm.
+  ; DEFAULT: load <16 x i8>, <16 x i8>*
+  ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16>
+  ; DEFAULT: add <16 x i16>
+  ; DEFAULT-NOT: llvm.masked.load
+  ; DEFAULT-NOT: llvm.masked.store
+  ; With the flag, the output is expected to use 8-lane masked loads/stores and no plain 16-lane load.
+  ; TAILPRED: llvm.masked.load.v8i8.p0v8i8
+  ; TAILPRED: sext <8 x i8> %{{.*}} to <8 x i16>
+  ; TAILPRED: add <8 x i16>
+  ; TAILPRED: call void @llvm.masked.store.v8i8.p0v8i8
+  ; TAILPRED-NOT: load <16 x i8>, <16 x i8>*
+  %3 = load i8*, i8** %input, align 8  ; base pointer used by every load in the innermost loop
+  %4 = sext i32 %1 to i64  ; bound of the outermost loop, widened to i64
+  %5 = icmp eq i32 %1, 0
+  br i1 %5, label %._crit_edge, label %.preheader47.preheader  ; skip all loops when %1 is zero
+
+.preheader47.preheader:
+  br label %.preheader47
+
+.preheader47:  ; outermost loop header: %.050 starts at 0 and is incremented in block %53
+  %.050 = phi i64 [ %54, %53 ], [ 0, %.preheader47.preheader ]
+  br label %.preheader
+
+._crit_edge.loopexit:  ; reached from %53 once the outermost loop is finished
+  br label %._crit_edge
+
+._crit_edge:  ; common exit, also taken directly when %1 is zero
+  ret i32 0  ; the function always returns 0
+
+.preheader:  ; middle loop header: %indvars.iv51 takes the values 1 .. 238
+  %indvars.iv51 = phi i32 [ 1, %.preheader47 ], [ %indvars.iv.next52, %52 ]
+  %6 = mul nuw nsw i32 %indvars.iv51, 320  ; row offset: middle index times the row stride of 320
+  br label %7
+
+7:  ; innermost loop (the one the checks expect to be vectorized): %indvars.iv takes the values 1 .. 318
+  %indvars.iv = phi i32 [ 1, %.preheader ], [ %indvars.iv.next, %7 ]
+  %8 = add nuw nsw i32 %6, %indvars.iv  ; linear index of the centre element
+  %9 = add nsw i32 %8, -320
+  %10 = add nsw i32 %8, -321
+  %11 = getelementptr inbounds i8, i8* %3, i32 %10
+  %12 = load i8, i8* %11, align 1
+  %13 = sext i8 %12 to i32  ; neighbour at offset -321
+  %14 = getelementptr inbounds i8, i8* %3, i32 %9
+  %15 = load i8, i8* %14, align 1
+  %16 = sext i8 %15 to i32  ; neighbour at offset -320
+  %17 = add nsw i32 %8, -319
+  %18 = getelementptr inbounds i8, i8* %3, i32 %17
+  %19 = load i8, i8* %18, align 1
+  %20 = sext i8 %19 to i32  ; neighbour at offset -319
+  %21 = add nsw i32 %8, -1
+  %22 = getelementptr inbounds i8, i8* %3, i32 %21
+  %23 = load i8, i8* %22, align 1
+  %24 = sext i8 %23 to i32  ; neighbour at offset -1
+  %25 = getelementptr inbounds i8, i8* %3, i32 %8
+  %26 = load i8, i8* %25, align 1
+  %27 = sext i8 %26 to i32  ; centre element (offset 0)
+  %28 = mul nsw i32 %27, 255  ; centre element weighted by 255
+  %29 = add nuw nsw i32 %8, 1
+  %30 = getelementptr inbounds i8, i8* %3, i32 %29
+  %31 = load i8, i8* %30, align 1
+  %32 = sext i8 %31 to i32  ; neighbour at offset +1
+  %33 = add nuw nsw i32 %8, 320
+  %34 = add nuw nsw i32 %8, 319
+  %35 = getelementptr inbounds i8, i8* %3, i32 %34
+  %36 = load i8, i8* %35, align 1
+  %37 = sext i8 %36 to i32  ; neighbour at offset +319
+  %38 = getelementptr inbounds i8, i8* %3, i32 %33
+  %39 = load i8, i8* %38, align 1
+  %40 = sext i8 %39 to i32  ; neighbour at offset +320
+  %41 = add nuw nsw i32 %8, 321
+  %42 = getelementptr inbounds i8, i8* %3, i32 %41
+  %43 = load i8, i8* %42, align 1
+  %44 = sext i8 %43 to i32  ; neighbour at offset +321
+  %reass.add = add nsw i32 %16, %13  ; start summing the eight neighbours
+  %reass.add44 = add nsw i32 %reass.add, %20
+  %reass.add45 = add nsw i32 %reass.add44, %24
+  %45 = add nsw i32 %reass.add45, %32
+  %46 = add nsw i32 %45, %37
+  %47 = add nsw i32 %46, %40
+  %reass.add46 = add nsw i32 %47, %44  ; sum of all eight neighbours
+  %reass.mul = mul nsw i32 %reass.add46, -28  ; neighbour sum weighted by -28
+  %48 = add nsw i32 %reass.mul, %28  ; weighted neighbours plus weighted centre
+  %49 = lshr i32 %48, 8  ; logical shift right by 8 bits
+  %50 = trunc i32 %49 to i8  ; narrow to i8 for the store
+  %51 = getelementptr inbounds i8, i8* %0, i32 %8
+  store i8 %50, i8* %51, align 1  ; result goes to %0 at the same linear index
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 319
+  br i1 %exitcond, label %52, label %7
+
+52:  ; innermost loop finished: advance the middle loop, leave it when the next index is 239
+  %indvars.iv.next52 = add nuw nsw i32 %indvars.iv51, 1
+  %exitcond53 = icmp eq i32 %indvars.iv.next52, 239
+  br i1 %exitcond53, label %53, label %.preheader
+
+53:  ; middle loop finished: advance the outermost loop while %54 is (unsigned) below %4
+  %54 = add nuw i64 %.050, 1
+  %55 = icmp ult i64 %54, %4
+  br i1 %55, label %.preheader47, label %._crit_edge.loopexit
+}