Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -163,6 +163,14 @@
   return getIntImmCost(Imm, Ty);
 }
 
+/// Returns true if \p Val is either a plain load instruction or a call to the
+/// llvm.masked.load intrinsic.
+static bool isLoadOrMaskedLoad(const Value *Val) {
+  if (const IntrinsicInst *IntrinsicOp = dyn_cast<IntrinsicInst>(Val))
+    return IntrinsicOp->getIntrinsicID() == Intrinsic::masked_load;
+  return isa<LoadInst>(Val);
+}
+
 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                  const Instruction *I, CastContextHint CCH) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -189,8 +197,14 @@
   if (!SrcTy.isSimple() || !DstTy.isSimple())
     return BaseT::getCastInstrCost(Opcode, Dst, Src, nullptr, CCH);
 
-  // The extend of a load is free
-  if (I && isa<LoadInst>(I->getOperand(0))) {
+  // Extending/Truncating masked load/stores is expensive because we can't split
+  // them. This means that we'll likely end up loading/storing each element
+  // individually (hence the high cost).
+ if (CCH == CastContextHint::MaskedExtOrTrunc && DstTy.getSizeInBits() > 128) + return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor(); + + // The extend of other kinds of load is free + if (I && isLoadOrMaskedLoad(I->getOperand(0))) { static const TypeConversionCostTblEntry LoadConversionTbl[] = { {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, Index: llvm/test/Analysis/CostModel/ARM/cast.ll =================================================================== --- llvm/test/Analysis/CostModel/ARM/cast.ll +++ llvm/test/Analysis/CostModel/ARM/cast.ll @@ -1930,3 +1930,291 @@ %h = bitcast i16 undef to half ret i32 undef } + +define void @masked_loads_and_stores() { +; CHECK-NEON-LABEL: 'masked_loads_and_stores' +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-NEON-NEXT: Cost 
Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x 
i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-MVE-LABEL: 'masked_loads_and_stores' +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-MVE-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: 
%v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x 
i8>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-V8M-MAIN-LABEL: 'masked_loads_and_stores' +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: 
%v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef 
to <8 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret void +; +; CHECK-V8M-BASE-LABEL: 'masked_loads_and_stores' +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> 
%maskedloadv16i8 to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; 
CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-V8R-LABEL: 'masked_loads_and_stores' +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x 
i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> 
undef to <8 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + + %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) + %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) + %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) + + ; zext/sexts that fit in a 128 bits register + + %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16> + %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16> + + %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32> + %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32> + + ; zext/sexts that don't fit in a 128 bits register + + 
%v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16> + %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32> + %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64> + %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16> + %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32> + %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64> + + %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32> + %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64> + %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32> + %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64> + + %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> + %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> + %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32> + %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64> + + %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64> + %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64> + + %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64> + %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64> + + ; trunc+stores that fit in a 128 bits register + + %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8> + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef) + + %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8> + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef) + + ; trunc+stores that don't fit in a 128 bits register + + %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16> + call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) + + %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32> + call 
void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef) + + %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32> + call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef) + + %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16> + call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef) + + ret void +} + +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) + +declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>) +declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, <8 x i1>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) +declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32 immarg, <16 x i1>) +declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32 immarg, <16 x i1>) Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -0,0 +1,121 @@ +; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=TAILPRED + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-arm-none-eabi" + +@Input = external 
dso_local local_unnamed_addr global i8*, align 8 + +define dso_local i32 @tp_reduces_vf(i8* nocapture %0, i32 %1) local_unnamed_addr #0 { + ; + ; When tail predication (TP) is disabled, this test should vectorize with a VF of 16 using plain (unmasked) loads. + ; When TP is enabled, the loads and stores become masked and this test should vectorize with a VF of 8. + ; + ; DEFAULT: load <16 x i8>, <16 x i8>* + ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16> + ; DEFAULT: add <16 x i16> + ; DEFAULT-NOT: llvm.masked.load + ; DEFAULT-NOT: llvm.masked.store + ; + ; TAILPRED: llvm.masked.load.v8i8.p0v8i8 + ; TAILPRED: sext <8 x i8> %{{.*}} to <8 x i16> + ; TAILPRED: add <8 x i16> + ; TAILPRED: call void @llvm.masked.store.v8i8.p0v8i8 + ; TAILPRED-NOT: load <16 x i8>, <16 x i8>* + %3 = load i8*, i8** @Input, align 8, !tbaa !0 + %4 = sext i32 %1 to i64 + %5 = icmp eq i32 %1, 0 + br i1 %5, label %._crit_edge, label %.preheader47.preheader + +.preheader47.preheader: ; preds = %2 + br label %.preheader47 + +.preheader47: ; preds = %.preheader47.preheader, %53 + %.050 = phi i64 [ %54, %53 ], [ 0, %.preheader47.preheader ] + br label %.preheader + +._crit_edge.loopexit: ; preds = %53 + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.loopexit, %2 + ret i32 0 + +.preheader: ; preds = %52, %.preheader47 + %indvars.iv51 = phi i32 [ 1, %.preheader47 ], [ %indvars.iv.next52, %52 ] + %6 = mul nuw nsw i32 %indvars.iv51, 320 + br label %7 + +7: ; preds = %7, %.preheader + %indvars.iv = phi i32 [ 1, %.preheader ], [ %indvars.iv.next, %7 ] + %8 = add nuw nsw i32 %6, %indvars.iv + %9 = add nsw i32 %8, -320 + %10 = add nsw i32 %8, -321 + %11 = getelementptr inbounds i8, i8* %3, i32 %10 + %12 = load i8, i8* %11, align 1, !tbaa !4 + %13 = sext i8 %12 to i32 + %14 = getelementptr inbounds i8, i8* %3, i32 %9 + %15 = load i8, i8* %14, align 1, !tbaa !4 + %16 = sext i8 %15 to i32 + %17 = add nsw i32 %8, -319 + %18 = getelementptr inbounds i8, i8* %3, i32 %17 + %19 = load i8, i8* %18, align 1, !tbaa !4 + %20 = sext i8 %19 to i32 + %21 = add nsw i32 %8, -1 + %22 =
getelementptr inbounds i8, i8* %3, i32 %21 + %23 = load i8, i8* %22, align 1, !tbaa !4 + %24 = sext i8 %23 to i32 + %25 = getelementptr inbounds i8, i8* %3, i32 %8 + %26 = load i8, i8* %25, align 1, !tbaa !4 + %27 = sext i8 %26 to i32 + %28 = mul nsw i32 %27, 255 + %29 = add nuw nsw i32 %8, 1 + %30 = getelementptr inbounds i8, i8* %3, i32 %29 + %31 = load i8, i8* %30, align 1, !tbaa !4 + %32 = sext i8 %31 to i32 + %33 = add nuw nsw i32 %8, 320 + %34 = add nuw nsw i32 %8, 319 + %35 = getelementptr inbounds i8, i8* %3, i32 %34 + %36 = load i8, i8* %35, align 1, !tbaa !4 + %37 = sext i8 %36 to i32 + %38 = getelementptr inbounds i8, i8* %3, i32 %33 + %39 = load i8, i8* %38, align 1, !tbaa !4 + %40 = sext i8 %39 to i32 + %41 = add nuw nsw i32 %8, 321 + %42 = getelementptr inbounds i8, i8* %3, i32 %41 + %43 = load i8, i8* %42, align 1, !tbaa !4 + %44 = sext i8 %43 to i32 + %reass.add = add nsw i32 %16, %13 + %reass.add44 = add nsw i32 %reass.add, %20 + %reass.add45 = add nsw i32 %reass.add44, %24 + %45 = add nsw i32 %reass.add45, %32 + %46 = add nsw i32 %45, %37 + %47 = add nsw i32 %46, %40 + %reass.add46 = add nsw i32 %47, %44 + %reass.mul = mul nsw i32 %reass.add46, -28 + %48 = add nsw i32 %reass.mul, %28 + %49 = lshr i32 %48, 8 + %50 = trunc i32 %49 to i8 + %51 = getelementptr inbounds i8, i8* %0, i32 %8 + store i8 %50, i8* %51, align 1, !tbaa !4 + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 319 + br i1 %exitcond, label %52, label %7 + +52: ; preds = %7 + %indvars.iv.next52 = add nuw nsw i32 %indvars.iv51, 1 + %exitcond53 = icmp eq i32 %indvars.iv.next52, 239 + br i1 %exitcond53, label %53, label %.preheader + +53: ; preds = %52 + %54 = add nuw i64 %.050, 1 + %55 = icmp ult i64 %54, %4 + br i1 %55, label %.preheader47, label %._crit_edge.loopexit +} + +attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign"
"denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = !{!1, !1, i64 0} +!1 = !{!"any pointer", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C++ TBAA"} +!4 = !{!2, !2, i64 0}