Diff 263136

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Show First 20 Lines • Show All 160 Lines • ▼ Show 20 Lines	int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,

// xor a, -1 can always be folded to MVN		// xor a, -1 can always be folded to MVN
if (Opcode == Instruction::Xor && Imm.isAllOnesValue())		if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
return 0;		return 0;

return getIntImmCost(Imm, Ty, CostKind);		return getIntImmCost(Imm, Ty, CostKind);
}		}

		static bool isLoadOrMaskedLoad(const Value *Val) {
		if (const IntrinsicInst *IntrinsicOp = dyn_cast<IntrinsicInst>(Val))
		return IntrinsicOp->getIntrinsicID() == Intrinsic::masked_load;
		return isa<LoadInst>(Val);
		}

int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type Dst, Type Src,		int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type Dst, Type Src,
TTI::CastContextHint CCH,		TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,		TTI::TargetCostKind CostKind,
const Instruction *I) {		const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);		int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");		assert(ISD && "Invalid opcode");

// Single to/from double precision conversions.		// Single to/from double precision conversions.
Show All 12 Lines	int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type Dst, Type Src,
}		}

EVT SrcTy = TLI->getValueType(DL, Src);		EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);		EVT DstTy = TLI->getValueType(DL, Dst);

if (!SrcTy.isSimple() \|\| !DstTy.isSimple())		if (!SrcTy.isSimple() \|\| !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind);		return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind);

// The extend of a load is free		// Extending masked load/Truncating masked stores is expensive because we
if (I && isa<LoadInst>(I->getOperand(0))) {		// currently don't split them. This means that we'll likely end up
		// loading/storing each element individually (hence the high cost).
		if (Opcode == Instruction::Trunc \|\| Opcode == Instruction::ZExt \|\|
		Opcode == Instruction::SExt)
		if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
		return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();

		// The extend of other kinds of load is free
		if (I && isLoadOrMaskedLoad(I->getOperand(0))) {
		dmgreen (Author, unsubmitted, done): Technically it's not that we can't split them, it's that we choose not to. Essentially we will have to end up paying the cost somewhere, and from a cost-modelling perspective, this is a good place to say that splitting them will be expensive, especially in a tail-predicated loop where we won't be able to split a vctp sensibly. Also, do we end up scalarizing the masked load? That might change where we put the cost (we could just implement getMemoryOpCost and put the cost there instead).
		dmgreen (Author, unsubmitted, done): This should now be (CCH == Normal || CCH == Masked)? The costs wouldn't really be right for other types, I don't think. We may need to fix those up later, but that can be done in a different patch.
static const TypeConversionCostTblEntry LoadConversionTbl[] = {		static const TypeConversionCostTblEntry LoadConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},		{ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
{ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},		{ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
{ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},		{ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
{ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},		{ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
{ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},		{ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
{ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},		{ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
{ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},		{ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
▲ Show 20 Lines • Show All 1,226 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/ARM/cast.ll

Show First 20 Lines • Show All 1,924 Lines • ▼ Show 20 Lines	;
%c = bitcast i32 undef to float		%c = bitcast i32 undef to float
%d = bitcast float undef to i32		%d = bitcast float undef to i32
%e = bitcast i64 undef to double		%e = bitcast i64 undef to double
%f = bitcast double undef to i64		%f = bitcast double undef to i64
%g = bitcast half undef to i16		%g = bitcast half undef to i16
%h = bitcast i16 undef to half		%h = bitcast i16 undef to half
ret i32 undef		ret i32 undef
}		}

		define void @masked_loads_and_stores() {
		; CHECK-NEON-LABEL: 'masked_loads_and_stores'
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
		;
		; CHECK-MVE-LABEL: 'masked_loads_and_stores'
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
		;
		; CHECK-V8M-MAIN-LABEL: 'masked_loads_and_stores'
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
		;
		; CHECK-V8M-BASE-LABEL: 'masked_loads_and_stores'
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
		;
		; CHECK-V8R-LABEL: 'masked_loads_and_stores'
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)
		; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
		;

		%maskedloadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
		%maskedloadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
		%maskedloadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
		%maskedloadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
		%maskedloadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)

		; zext/sexts that fit in a 128 bits register

		%v8i8_to_v8i16_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i16>
		%v8i8_to_v8i16_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i16>

		%v4i16_to_v4i32_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i32>
		; [Inline review comment, marked Done] dmgreen: Can you make sure there are tests for the legal types: <4xi8>-><4xi32>, <4xi16>-><4xi32> and <8xi8>-><8xi16>?
		%v4i16_to_v4i32_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i32>

		; zext/sexts that don't fit in a 128 bits register

		%v16i8_to_v16i16_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i16>
		%v16i8_to_v16i32_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i32>
		%v16i8_to_v16i64_zext = zext <16 x i8> %maskedloadv16i8 to <16 x i64>
		%v16i8_to_v16i16_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i16>
		%v16i8_to_v16i32_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i32>
		%v16i8_to_v16i64_sext = sext <16 x i8> %maskedloadv16i8 to <16 x i64>

		%v8i8_to_v8i32_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i32>
		%v8i8_to_v8i64_zext = zext <8 x i8> %maskedloadv8i8 to <8 x i64>
		%v8i8_to_v8i32_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i32>
		%v8i8_to_v8i64_sext = sext <8 x i8> %maskedloadv8i8 to <8 x i64>

		%v8i16_to_v8i32_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		%v8i16_to_v8i64_zext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>
		%v8i16_to_v8i32_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i32>
		%v8i16_to_v8i64_sext = zext <8 x i16> %maskedloadv8i16 to <8 x i64>

		%v4i16_to_v4i64_zext = zext <4 x i16> %maskedloadv4i16 to <4 x i64>
		%v4i16_to_v4i64_sext = sext <4 x i16> %maskedloadv4i16 to <4 x i64>

		%v4i32_to_v4i64_zext = zext <4 x i32> %maskedloadv4i32 to <4 x i64>
		%v4i32_to_v4i64_sext = sext <4 x i32> %maskedloadv4i32 to <4 x i64>

		; trunc+stores that fit in a 128 bits register

		%v8i64_to_v8i8_trunc = trunc <8 x i64> undef to <8 x i8>
		; [Inline review comment, marked Done] dmgreen: Again, can you add stores for truncs of legal types: <4xi32>-><4xi8>, <4xi32>-><4xi16> and <8xi16>-><8xi8>?
		call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %v8i64_to_v8i8_trunc, <8 x i8>* undef, i32 1, <8 x i1> undef)

		%v16i32_to_v16i8_trunc = trunc <16 x i32> undef to <16 x i8>
		call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v16i32_to_v16i8_trunc, <16 x i8>* undef, i32 1, <16 x i1> undef)

		; trunc+stores that don't fit in a 128 bits register

		%v16i32_to_v16i16_trunc = trunc <16 x i32> undef to <16 x i16>
		call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i32_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)

		%v8i64_to_v8i32_trunc = trunc <8 x i64> undef to <8 x i32>
		call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v8i64_to_v8i32_trunc, <8 x i32>* undef, i32 1, <8 x i1> undef)

		%v16i64_to_v16i32_trunc = trunc <16 x i64> undef to <16 x i32>
		call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %v16i64_to_v16i32_trunc, <16 x i32>* undef, i32 1, <16 x i1> undef)

		%v16i64_to_v16i16_trunc = trunc <16 x i64> undef to <16 x i16>
		call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %v16i64_to_v16i16_trunc, <16 x i16>* undef, i32 1, <16 x i1> undef)

		ret void
		}

		; Masked-load intrinsics, one per source vector type loaded in
		; @masked_loads_and_stores. Operands: pointer, alignment, mask, passthru.
		declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
		declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
		declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
		declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
		declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

		; Masked-store intrinsics, one per truncated result type stored in
		; @masked_loads_and_stores. Operands: value, pointer, alignment, mask.
		declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>)
		declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, <8 x i1>)
		declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
		declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32 immarg, <16 x i1>)
		declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32 immarg, <16 x i1>)

llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll

This file was added.

				; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT
				; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=TAILPRED

				; Little-endian data layout with 32-bit pointers, paired with a Thumb
				; Armv8.1-M Mainline triple.
				target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
				target triple = "thumbv8.1m.main-arm-none-eabi"

				; External global of type i8*; @tp_reduces_vf reads its input buffer pointer from it.
				@Input = external dso_local local_unnamed_addr global i8*, align 8

				; Loop-vectorizer test kernel: a triply nested loop whose innermost loop reads
				; the 3x3 neighbourhood of each element (row stride 320) from the i8 buffer
				; that @Input points to, sign-extends every byte to i32, computes
				; 255 * centre - 28 * (sum of the 8 neighbours), shifts the result right by 8
				; and stores the truncated byte into the output buffer.
				;   %0 - destination i8 buffer, written at the same flat index as the centre.
				;   %1 - sign-extended to i64 and used as the bound of the outermost loop;
				;        when it is 0 every loop is skipped.
				; Always returns 0.
				define dso_local i32 @tp_reduces_vf(i8* nocapture %0, i32 %1) local_unnamed_addr #0 {
				;
				; When TP is disabled, this test should vectorize with a VF of 16.
				; When TP is enabled, this test should vectorize with a VF of 8.
				;
				; DEFAULT: load <16 x i8>, <16 x i8>*
				; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16>
				; DEFAULT: add <16 x i16>
				; DEFAULT-NOT: llvm.masked.load
				; DEFAULT-NOT: llvm.masked.store
				;
				; TAILPRED: llvm.masked.load.v8i8.p0v8i8
				; TAILPRED: sext <8 x i8> %{{.*}} to <8 x i16>
				; TAILPRED: add <8 x i16>
				; TAILPRED: call void @llvm.masked.store.v8i8.p0v8i8
				; TAILPRED-NOT: load <16 x i8>, <16 x i8>*
				; @Input is a global of type i8*, so the value loaded from it is the i8*
				; base pointer that every getelementptr in the inner loop indexes from.
				%3 = load i8*, i8** @Input, align 8, !tbaa !0
				%4 = sext i32 %1 to i64
				; Skip all loops when the requested repeat count is zero.
				%5 = icmp eq i32 %1, 0
				br i1 %5, label %._crit_edge, label %.preheader47.preheader

				.preheader47.preheader: ; preds = %2
				br label %.preheader47

				; Outermost loop: %.050 starts at 0 and the loop repeats while the
				; incremented counter is unsigned-less-than %4.
				.preheader47: ; preds = %.preheader47.preheader, %53
				%.050 = phi i64 [ %54, %53 ], [ 0, %.preheader47.preheader ]
				br label %.preheader

				._crit_edge.loopexit: ; preds = %53
				br label %._crit_edge

				._crit_edge: ; preds = %._crit_edge.loopexit, %2
				ret i32 0

				; Middle loop over rows: %indvars.iv51 runs from 1 to 238 and %6 is the
				; row offset (row * 320).
				.preheader: ; preds = %52, %.preheader47
				%indvars.iv51 = phi i32 [ 1, %.preheader47 ], [ %indvars.iv.next52, %52 ]
				%6 = mul nuw nsw i32 %indvars.iv51, 320
				br label %7

				; Innermost loop over columns: %indvars.iv runs from 1 to 318 and %8 is the
				; flat index of the centre element. This is the loop the vectorizer targets.
				7: ; preds = %7, %.preheader
				%indvars.iv = phi i32 [ 1, %.preheader ], [ %indvars.iv.next, %7 ]
				%8 = add nuw nsw i32 %6, %indvars.iv
				; Row above the centre: indices %8 - 321, %8 - 320, %8 - 319.
				%9 = add nsw i32 %8, -320
				%10 = add nsw i32 %8, -321
				%11 = getelementptr inbounds i8, i8* %3, i32 %10
				%12 = load i8, i8* %11, align 1, !tbaa !4
				%13 = sext i8 %12 to i32
				%14 = getelementptr inbounds i8, i8* %3, i32 %9
				%15 = load i8, i8* %14, align 1, !tbaa !4
				%16 = sext i8 %15 to i32
				%17 = add nsw i32 %8, -319
				%18 = getelementptr inbounds i8, i8* %3, i32 %17
				%19 = load i8, i8* %18, align 1, !tbaa !4
				%20 = sext i8 %19 to i32
				; Same row: left neighbour, the centre (scaled by 255), right neighbour.
				%21 = add nsw i32 %8, -1
				%22 = getelementptr inbounds i8, i8* %3, i32 %21
				%23 = load i8, i8* %22, align 1, !tbaa !4
				%24 = sext i8 %23 to i32
				%25 = getelementptr inbounds i8, i8* %3, i32 %8
				%26 = load i8, i8* %25, align 1, !tbaa !4
				%27 = sext i8 %26 to i32
				%28 = mul nsw i32 %27, 255
				%29 = add nuw nsw i32 %8, 1
				%30 = getelementptr inbounds i8, i8* %3, i32 %29
				%31 = load i8, i8* %30, align 1, !tbaa !4
				%32 = sext i8 %31 to i32
				; Row below the centre: indices %8 + 319, %8 + 320, %8 + 321.
				%33 = add nuw nsw i32 %8, 320
				%34 = add nuw nsw i32 %8, 319
				%35 = getelementptr inbounds i8, i8* %3, i32 %34
				%36 = load i8, i8* %35, align 1, !tbaa !4
				%37 = sext i8 %36 to i32
				%38 = getelementptr inbounds i8, i8* %3, i32 %33
				%39 = load i8, i8* %38, align 1, !tbaa !4
				%40 = sext i8 %39 to i32
				%41 = add nuw nsw i32 %8, 321
				%42 = getelementptr inbounds i8, i8* %3, i32 %41
				%43 = load i8, i8* %42, align 1, !tbaa !4
				%44 = sext i8 %43 to i32
				; Sum the eight neighbours, scale the sum by -28 and add 255 * centre.
				%reass.add = add nsw i32 %16, %13
				%reass.add44 = add nsw i32 %reass.add, %20
				%reass.add45 = add nsw i32 %reass.add44, %24
				%45 = add nsw i32 %reass.add45, %32
				%46 = add nsw i32 %45, %37
				%47 = add nsw i32 %46, %40
				%reass.add46 = add nsw i32 %47, %44
				%reass.mul = mul nsw i32 %reass.add46, -28
				%48 = add nsw i32 %reass.mul, %28
				; Logical shift right by 8, then narrow to i8 and store at the centre index.
				%49 = lshr i32 %48, 8
				%50 = trunc i32 %49 to i8
				%51 = getelementptr inbounds i8, i8* %0, i32 %8
				store i8 %50, i8* %51, align 1, !tbaa !4
				%indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
				%exitcond = icmp eq i32 %indvars.iv.next, 319
				br i1 %exitcond, label %52, label %7

				; Latch of the middle (row) loop.
				52: ; preds = %7
				%indvars.iv.next52 = add nuw nsw i32 %indvars.iv51, 1
				%exitcond53 = icmp eq i32 %indvars.iv.next52, 239
				br i1 %exitcond53, label %53, label %.preheader

				; Latch of the outermost loop.
				53: ; preds = %52
				%54 = add nuw i64 %.050, 1
				%55 = icmp ult i64 %54, %4
				br i1 %55, label %.preheader47, label %._crit_edge.loopexit
				}

				; Function attributes for @tp_reduces_vf. The target-features string enables
				; Armv8.1-M Mainline with MVE (+mve, +mve.fp) and low-overhead branches (+lob).
				attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="false" "use-soft-float"="false" }
				; [Inline review comment, marked Done] dmgreen: Can you try and clean up the test?

				; TBAA metadata. !0 is the access tag for "any pointer" (attached to the load
				; of @Input); !4 is the access tag for "omnipotent char" (attached to the i8
				; loads and the store). !3 is the root node, "Simple C++ TBAA".
				!0 = !{!1, !1, i64 0}
				!1 = !{!"any pointer", !2, i64 0}
				!2 = !{!"omnipotent char", !3, i64 0}
				!3 = !{!"Simple C++ TBAA"}
				!4 = !{!2, !2, i64 0}

This is an archive of the discontinued LLVM Phabricator instance.

[Target][ARM] Tune getCastInstrCost for extending masked loads and truncating masked stores
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 263136

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

llvm/test/Analysis/CostModel/ARM/cast.ll

llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll

This is an archive of the discontinued LLVM Phabricator instance.

[Target][ARM] Tune getCastInstrCost for extending masked loads and truncating masked stores ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 263136

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

llvm/test/Analysis/CostModel/ARM/cast.ll

llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll

[Target][ARM] Tune getCastInstrCost for extending masked loads and truncating masked stores
ClosedPublic