Diff 381174

llvm/lib/Target/AArch64/AArch64Subtarget.h

Show First 20 Lines • Show All 274 Lines • ▼ Show 20 Lines	protected:

// CustomCallUsedXRegister[i] - X#i call saved.		// CustomCallUsedXRegister[i] - X#i call saved.
BitVector CustomCallSavedXRegs;		BitVector CustomCallSavedXRegs;

bool IsLittle;		bool IsLittle;

unsigned MinSVEVectorSizeInBits;		unsigned MinSVEVectorSizeInBits;
unsigned MaxSVEVectorSizeInBits;		unsigned MaxSVEVectorSizeInBits;
		unsigned VScaleForTuning = 2;
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: This may be personal preference, but would `VScaleForCPU` be a more suitable name? sdesmalen: nit: This may be personal preference, but would `VScaleForCPU` be a more suitable name?
		david-armAuthorUnsubmitted Done Reply Inline Actions I don't mind really. I don't have a strong preference. I guess the reason I used VScaleForTuning was just to make it clear that this is not an indication of what the exact vscale will be at runtime, since it could be anything from 1 -> max vscale for the CPU. Rather it's just an indication of how we'd like to tune the vectoriser, optimisations and codegen. However, happy to change it if you prefer it! david-arm: I don't mind really. I don't have a strong preference. I guess the reason I used…
		sdesmalenUnsubmitted Not Done Reply Inline Actions Okay fair enough, maybe 'VScaleForTuning' is more appropriate. sdesmalen: Okay fair enough, maybe 'VScaleForTuning' is more appropriate.

/// TargetTriple - What processor and OS we're targeting.		/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;		Triple TargetTriple;

AArch64FrameLowering FrameLowering;		AArch64FrameLowering FrameLowering;
AArch64InstrInfo InstrInfo;		AArch64InstrInfo InstrInfo;
AArch64SelectionDAGInfo TSInfo;		AArch64SelectionDAGInfo TSInfo;
AArch64TargetLowering TLInfo;		AArch64TargetLowering TLInfo;
▲ Show 20 Lines • Show All 359 Lines • ▼ Show 20 Lines	public:
}		}

unsigned getMinSVEVectorSizeInBits() const {		unsigned getMinSVEVectorSizeInBits() const {
assert(HasSVE && "Tried to get SVE vector length without SVE support!");		assert(HasSVE && "Tried to get SVE vector length without SVE support!");
return MinSVEVectorSizeInBits;		return MinSVEVectorSizeInBits;
}		}

bool useSVEForFixedLengthVectors() const;		bool useSVEForFixedLengthVectors() const;

		unsigned getVScaleForTuning() const { return VScaleForTuning; }
};		};
} // End llvm namespace		} // End llvm namespace

#endif		#endif

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

Show First 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	case Others:
break;		break;
case Carmel:		case Carmel:
CacheLineSize = 64;		CacheLineSize = 64;
break;		break;
case CortexA35:		case CortexA35:
break;		break;
case CortexA53:		case CortexA53:
case CortexA55:		case CortexA55:
		PrefFunctionLogAlignment = 4;
		break;
case CortexA510:		case CortexA510:
PrefFunctionLogAlignment = 4;		PrefFunctionLogAlignment = 4;
		VScaleForTuning = 1;
break;		break;
case CortexA57:		case CortexA57:
MaxInterleaveFactor = 4;		MaxInterleaveFactor = 4;
PrefFunctionLogAlignment = 4;		PrefFunctionLogAlignment = 4;
break;		break;
case CortexA65:		case CortexA65:
PrefFunctionLogAlignment = 3;		PrefFunctionLogAlignment = 3;
break;		break;
Show All 11 Lines	void AArch64Subtarget::initializeProperties() {
case A64FX:		case A64FX:
CacheLineSize = 256;		CacheLineSize = 256;
PrefFunctionLogAlignment = 3;		PrefFunctionLogAlignment = 3;
PrefLoopLogAlignment = 2;		PrefLoopLogAlignment = 2;
MaxInterleaveFactor = 4;		MaxInterleaveFactor = 4;
PrefetchDistance = 128;		PrefetchDistance = 128;
MinPrefetchStride = 1024;		MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;		MaxPrefetchIterationsAhead = 4;
		VScaleForTuning = 4;
break;		break;
case AppleA7:		case AppleA7:
case AppleA10:		case AppleA10:
case AppleA11:		case AppleA11:
case AppleA12:		case AppleA12:
case AppleA13:		case AppleA13:
case AppleA14:		case AppleA14:
CacheLineSize = 64;		CacheLineSize = 64;
Show All 25 Lines	case Kryo:
MaxPrefetchIterationsAhead = 11;		MaxPrefetchIterationsAhead = 11;
// FIXME: remove this to enable 64-bit SLP if performance looks good.		// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;		MinVectorRegisterBitWidth = 128;
break;		break;
case NeoverseE1:		case NeoverseE1:
PrefFunctionLogAlignment = 3;		PrefFunctionLogAlignment = 3;
break;		break;
case NeoverseN1:		case NeoverseN1:
		PrefFunctionLogAlignment = 4;
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: seems unrelated? sdesmalen: nit: seems unrelated?
		david-armAuthorUnsubmitted Done Reply Inline Actions I had to split this out as a separate case to avoid the fallthrough into the NeoverseN2 case, that's all. david-arm: I had to split this out as a separate case to avoid the fallthrough into the NeoverseN2 case…
		break;
case NeoverseN2:		case NeoverseN2:
		PrefFunctionLogAlignment = 4;
		VScaleForTuning = 1;
		break;
case NeoverseV1:		case NeoverseV1:
PrefFunctionLogAlignment = 4;		PrefFunctionLogAlignment = 4;
		VScaleForTuning = 2;
break;		break;
case Saphira:		case Saphira:
MaxInterleaveFactor = 4;		MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.		// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;		MinVectorRegisterBitWidth = 128;
break;		break;
case ThunderX2T99:		case ThunderX2T99:
CacheLineSize = 64;		CacheLineSize = 64;
Show All 29 Lines	case ThunderX3T110:
PrefetchDistance = 128;		PrefetchDistance = 128;
MinPrefetchStride = 1024;		MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;		MaxPrefetchIterationsAhead = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.		// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;		MinVectorRegisterBitWidth = 128;
break;		break;
}		}
}		}

		dmgreenUnsubmitted Done Reply Inline Actions I don't think these should be needed, as the above ARMProcFamily should be based on TuneCPU after D110258/D111551. dmgreen: I don't think these should be needed, as the above ARMProcFamily should be based on TuneCPU…
		david-armAuthorUnsubmitted Done Reply Inline Actions Hi @dmgreen, I was quite worried that D111551 will hold up this patch and I don't really see why it has to, therefore I have kept them independent. I was thinking that once D111551 lands then I can fix up this code and use ARMProcFamily instead. We'd really like to get the cost model changes in asap because it's preventing vectorisation on machines with SVE hardware. I feel that D111551 is a nice thing to have, but not a requirement for the cost model. I also expect it will take a lot longer to get approved. :) david-arm: Hi @dmgreen, I was quite worried that D111551 will hold up this patch and I don't really see…
		dmgreenUnsubmitted Not Done Reply Inline Actions I'm not sure I see why we need to rush with a suboptimal solution. SVE autovec was not enabled, and this patch just enabled some extra gather/scatters and reductions. D111551 doesn't look like it should take too long either, but maybe I'm being optimistic there. If you want it to land sooner, you could just remove all the references to tune-cpu in this patch. That way it works with -mcpu, the same as every other tuning feature has in llvm for the last 10 years :) It should then automatically become a tuning feature when those other two patches are done. dmgreen: I'm not sure I see why we need to rush with a suboptimal solution. SVE autovec was not enabled…
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,		AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &TuneCPU,		const std::string &TuneCPU,
const std::string &FS,		const std::string &FS,
const TargetMachine &TM, bool LittleEndian,		const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride,		unsigned MinSVEVectorSizeInBitsOverride,
unsigned MaxSVEVectorSizeInBitsOverride)		unsigned MaxSVEVectorSizeInBitsOverride)
: AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),		: AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),		ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
▲ Show 20 Lines • Show All 142 Lines • ▼ Show 20 Lines	if (!MFI.isMaxCallFrameSizeComputed())
MFI.computeMaxCallFrameSize(MF);		MFI.computeMaxCallFrameSize(MF);
}		}

bool AArch64Subtarget::useSVEForFixedLengthVectors() const {		bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
// Prefer NEON unless larger SVE registers are available.		// Prefer NEON unless larger SVE registers are available.
return hasSVE() && getMinSVEVectorSizeInBits() >= 256;		return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}		}

bool AArch64Subtarget::useAA() const { return UseAA; }		bool AArch64Subtarget::useAA() const { return UseAA; }
		dmgreenUnsubmitted Not Done Reply Inline Actions These per-cpu values should probably be subtarget features or defined in AArch64Subtarget::initializeProperties like the other alignments and whatnot. dmgreen: These per-cpu values should probably be subtarget features or defined in AArch64Subtarget…
		david-armAuthorUnsubmitted Done Reply Inline Actions Hi @dmgreen, I think that's perfectly sensible when you only have a target-cpu feature. I originally tried doing it that way, but realised if target-cpu=generic and tune-cpu=neoverse-n1 then the subtarget won't have max vscale set, since we chose a generic subtarget. david-arm: Hi @dmgreen, I think that's perfectly sensible when you only have a target-cpu feature. I…
		dmgreenUnsubmitted Done Reply Inline Actions Tune-cpu has never worked in the AArch64 backend, as far as I understand. Like I said in the other patch, the subtarget features do not distinguish between performance features and architecture features. Having one feature work off tune-cpu whereas everything else in the backend works on target-cpu sounds wrong to me, without fixing it properly and making -mtune work as you would expect across the board. Which would be great but until then I would base everything off the cpu for consistency. If you do want to make tune-cpu work, look at how the X86 backend does it. This shouldn't have to look into the Function's target-cpu and tune-cpu attributes, it should be initialized with the correct CPU and TuneCPU passed in to the subtarget. dmgreen: Tune-cpu has never worked in the AArch64 backend, as far as I understand. Like I said in the…
		sdesmalenUnsubmitted Done Reply Inline Actions nit: redundant comment, the code says the same thing, just shorter :) sdesmalen: nit: redundant comment, the code says the same thing, just shorter :)
		sdesmalenUnsubmitted Done Reply Inline Actions 1. F is already known to be != nullptr. 2. For consistency, can you do the same as above here, get the attribute and then check whether it's valid (as opposed to checking whether the function has the attribute)? sdesmalen: 1. F is already known to be != nullptr. 2. For consistency, can you do the same as above here…

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Show First 20 Lines • Show All 124 Lines • ▼ Show 20 Lines	unsigned getMinVectorRegisterBitWidth() const {
return ST->getMinVectorRegisterBitWidth();		return ST->getMinVectorRegisterBitWidth();
}		}


/// Try to return an estimate cost factor that can be used as a multiplier		/// Try to return an estimate cost factor that can be used as a multiplier
/// when scalarizing an operation for a vector with ElementCount \p VF.		/// when scalarizing an operation for a vector with ElementCount \p VF.
/// For scalable vectors this currently takes the most pessimistic view based		/// For scalable vectors this currently takes the most pessimistic view based
/// upon the maximum possible value for vscale.		/// upon the maximum possible value for vscale.
unsigned getMaxNumElements(ElementCount VF,		unsigned getMaxNumElements(ElementCount VF) const {
const Function *F = nullptr) const {
if (!VF.isScalable())		if (!VF.isScalable())
return VF.getFixedValue();		return VF.getFixedValue();

unsigned MaxNumVScale = 16;		return VF.getKnownMinValue() * ST->getVScaleForTuning();
if (F && F->hasFnAttribute(Attribute::VScaleRange)) {
unsigned VScaleMax =
F->getFnAttribute(Attribute::VScaleRange).getVScaleRangeArgs().second;
if (VScaleMax > 0)
MaxNumVScale = VScaleMax;
}

return MaxNumVScale * VF.getKnownMinValue();
}		}

unsigned getMaxInterleaveFactor(unsigned VF);		unsigned getMaxInterleaveFactor(unsigned VF);

InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,		InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment, unsigned AddressSpace,		Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind);		TTI::TargetCostKind CostKind);

▲ Show 20 Lines • Show All 181 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Show First 20 Lines • Show All 1,626 Lines • ▼ Show 20 Lines	InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
// sufficiently reliable.		// sufficiently reliable.
if (cast<VectorType>(DataTy)->getElementCount() ==		if (cast<VectorType>(DataTy)->getElementCount() ==
ElementCount::getScalable(1))		ElementCount::getScalable(1))
return InstructionCost::getInvalid();		return InstructionCost::getInvalid();

ElementCount LegalVF = LT.second.getVectorElementCount();		ElementCount LegalVF = LT.second.getVectorElementCount();
InstructionCost MemOpCost =		InstructionCost MemOpCost =
getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);		getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction());		return LT.first * MemOpCost * getMaxNumElements(LegalVF);
}		}

bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {		bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();		return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
}		}

InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,		InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
MaybeAlign Alignment,		MaybeAlign Alignment,
▲ Show 20 Lines • Show All 655 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AArch64/sve-gather.ll

	; Check getIntrinsicInstrCost in BasicTTIImpl.h for masked gather			; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
				; RUN: opt -analyze -cost-model < %s \| FileCheck %s
				; RUN: opt -analyze -cost-model -mcpu=neoverse-v1 < %s \| FileCheck %s --check-prefix=CHECK-VSCALE-2
				; RUN: opt -analyze -cost-model -mcpu=neoverse-n2 < %s \| FileCheck %s --check-prefix=CHECK-VSCALE-1
				; RUN: opt -analyze -cost-model -mcpu=cortex-a510 < %s \| FileCheck %s --check-prefix=CHECK-VSCALE-1

	; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s \| FileCheck %s			target triple="aarch64--linux-gnu"

	define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) vscale_range(0, 16) {			define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #0 {
	; CHECK-LABEL: 'masked_gathers'			; CHECK-LABEL: 'masked_gathers'
	; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32			; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32			; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %res.v4i32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32			; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res.v1i128 = call <1 x i128> @llvm.masked.gather.v1i128.v1p0i128			; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64			;
				; CHECK-VSCALE-2-LABEL: 'masked_gathers'
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
				; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-1-LABEL: 'masked_gathers'
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
				; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
	%res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)			%res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
	%res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)			%res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
	%res.v4i32 = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> undef, i32 0, <4 x i1> %v4i1mask, <4 x i32> zeroinitializer)			%res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
	%res.v1i128 = call <1 x i128> @llvm.masked.gather.v1i128.v1p0i128(<1 x i128*> undef, i32 0, <1 x i1> %v1i1mask, <1 x i128> zeroinitializer)
	%res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
	ret void			ret void
	}			}

	define void @masked_gathers_no_vscale_range() {			define void @masked_gathers_tune_generic(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #1 {
				; CHECK-LABEL: 'masked_gathers_tune_generic'
				; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
				; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
				; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
				; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-2-LABEL: 'masked_gathers_tune_generic'
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
				; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-1-LABEL: 'masked_gathers_tune_generic'
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
				; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				%res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
				%res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
				%res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
				ret void
				}

				define void @masked_gathers_no_vscale_range() #2 {
	; CHECK-LABEL: 'masked_gathers_no_vscale_range'			; CHECK-LABEL: 'masked_gathers_no_vscale_range'
	; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
				; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-2-LABEL: 'masked_gathers_no_vscale_range'
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-1-LABEL: 'masked_gathers_no_vscale_range'
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
	%res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)			%res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
	%res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)			%res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)

	%res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)			%res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
	%res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)			%res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
	%res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)			%res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)

	%res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)			%res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
	%res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)			%res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
	%res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)			%res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)

	ret void			ret void
	}			}

				attributes #0 = { "target-features"="+sve" vscale_range(0, 8) }
				attributes #1 = { "target-features"="+sve" vscale_range(0, 16) "tune-cpu"="generic" }
				attributes #2 = { "target-features"="+sve" }

	declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)			declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
	declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)			declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
	declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)			declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x i64*>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
	declare <1 x i128> @llvm.masked.gather.v1i128.v1p0i128(<1 x i128*>, i32, <1 x i1>, <1 x i128>)
	declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
	declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x double*>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)			declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x double*>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)
	declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)			declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
	declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x float*>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)			declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x float*>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)
	declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)			declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
	declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)			declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
	declare <vscale x 16 x i16> @llvm.masked.gather.nxv16i16(<vscale x 16 x i16*>, i32, <vscale x 16 x i1>, <vscale x 16 x i16>)			declare <vscale x 16 x i16> @llvm.masked.gather.nxv16i16(<vscale x 16 x i16*>, i32, <vscale x 16 x i1>, <vscale x 16 x i16>)
	declare <vscale x 8 x i16> @llvm.masked.gather.nxv8i16(<vscale x 8 x i16*>, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)			declare <vscale x 8 x i16> @llvm.masked.gather.nxv8i16(<vscale x 8 x i16*>, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
	declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)			declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)

llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll

Show First 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	;
%fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)		%fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
%fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)		%fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)

ret void		ret void
}		}

define void @strict_fp_reductions(<vscale x 4 x float> %v0, <vscale x 4 x double> %v1) {		define void @strict_fp_reductions(<vscale x 4 x float> %v0, <vscale x 4 x double> %v1) {
; CHECK-LABEL: 'strict_fp_reductions'		; CHECK-LABEL: 'strict_fp_reductions'
; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)		; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)		; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)		; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)		; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void		; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;		;
%fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v0)		%fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v0)
%fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v1)		%fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v1)
%fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.0, <vscale x 4 x float> %v0)		%fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.0, <vscale x 4 x float> %v0)
%fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.0, <vscale x 4 x double> %v1)		%fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.0, <vscale x 4 x double> %v1)
▲ Show 20 Lines • Show All 319 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll

	; Check getIntrinsicInstrCost in BasicTTIImpl.h with for masked scatter			; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
				; RUN: opt -analyze -cost-model < %s | FileCheck %s
				; RUN: opt -analyze -cost-model -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-2
				; RUN: opt -analyze -cost-model -mcpu=neoverse-n2 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1
				; RUN: opt -analyze -cost-model -mcpu=cortex-a510 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1

	; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s			target triple="aarch64--linux-gnu"

	define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) vscale_range(0, 16) {			define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #0 {
	; CHECK-LABEL: 'masked_scatters'			; CHECK-LABEL: 'masked_scatters'
	; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32			; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32			; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32			; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v1i128.v1p0i128			; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
	; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64			;
				; CHECK-VSCALE-2-LABEL: 'masked_scatters'
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
				; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-1-LABEL: 'masked_scatters'
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
				; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
	call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)			call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
	call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)			call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
	call void @llvm.masked.scatter.v4i32(<4 x i32> undef, <4 x i32*> undef, i32 0, <4 x i1> %v4i1mask)			call void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
	call void @llvm.masked.scatter.v1i128.v1p0i128(<1 x i128> undef, <1 x i128*> undef, i32 0, <1 x i1> %v1i1mask)
	call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
	ret void			ret void
	}			}

	define void @masked_scatters_no_vscale_range() {			define void @masked_scatters_tune_generic(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #1 {
				; CHECK-LABEL: 'masked_scatters_tune_generic'
				; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
				; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
				; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
				; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-2-LABEL: 'masked_scatters_tune_generic'
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
				; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-1-LABEL: 'masked_scatters_tune_generic'
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
				; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
				call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
				call void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
				ret void
				}

				define void @masked_scatters_no_vscale_range() #2 {
	; CHECK-LABEL: 'masked_scatters_no_vscale_range'			; CHECK-LABEL: 'masked_scatters_no_vscale_range'
	; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)
	; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)			; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)
				; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-2-LABEL: 'masked_scatters_no_vscale_range'
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)
				; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
				; CHECK-VSCALE-1-LABEL: 'masked_scatters_no_vscale_range'
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)
				; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
				;
	call void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)			call void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
	call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)			call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)

	call void @llvm.masked.scatter.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)			call void @llvm.masked.scatter.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)
	call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)			call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)
	call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)			call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)

	call void @llvm.masked.scatter.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)			call void @llvm.masked.scatter.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)
	call void @llvm.masked.scatter.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)			call void @llvm.masked.scatter.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)
	call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)			call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)

	ret void			ret void
	}			}

				attributes #0 = { "target-features"="+sve" vscale_range(0, 8) }
				attributes #1 = { "target-features"="+sve" vscale_range(0, 16) "tune-cpu"="generic" }
				attributes #2 = { "target-features"="+sve" }

	declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)			declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
	declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32*>, i32, <vscale x 8 x i1>)			declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32*>, i32, <vscale x 8 x i1>)
	declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)			declare void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64*>, i32, <vscale x 1 x i1>)
	declare void @llvm.masked.scatter.v1i128.v1p0i128(<1 x i128>, <1 x i128*>, i32, <1 x i1>)
	declare void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64>, <vscale x 1 x i64*>, i32, <vscale x 1 x i1>)
	declare void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double*>, i32, <vscale x 4 x i1>)			declare void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double*>, i32, <vscale x 4 x i1>)
	declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)			declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
	declare void @llvm.masked.scatter.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float*>, i32, <vscale x 8 x i1>)			declare void @llvm.masked.scatter.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float*>, i32, <vscale x 8 x i1>)
	declare void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float*>, i32, <vscale x 4 x i1>)			declare void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float*>, i32, <vscale x 4 x i1>)
	declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)			declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
	declare void @llvm.masked.scatter.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16*>, i32, <vscale x 16 x i1>)			declare void @llvm.masked.scatter.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16*>, i32, <vscale x 16 x i1>)
	declare void @llvm.masked.scatter.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16*>, i32, <vscale x 8 x i1>)			declare void @llvm.masked.scatter.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16*>, i32, <vscale x 8 x i1>)
	declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)			declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)

llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll

	; REQUIRES: asserts			; REQUIRES: asserts
	; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \			; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \
	; RUN: -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4			; RUN: -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
	; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \			; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \
	; RUN: -scalable-vectorization=on -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8			; RUN: -scalable-vectorization=on -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
				; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \
				; RUN: -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -mcpu=neoverse-n2 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4-CPU-NEOVERSE-N2

	target triple="aarch64-unknown-linux-gnu"			target triple="aarch64-unknown-linux-gnu"

	; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07			; CHECK-VF4: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
	; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07			; CHECK-VF8: Found an estimated cost of 32 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07
				; CHECK-VF4-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07

	define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {			define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {
	entry:			entry:
	br label %for.body			br label %for.body

	for.body:			for.body:
	%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]			%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
	%sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]			%sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
	%arrayidx = getelementptr inbounds float, float* %a, i64 %iv			%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
	%0 = load float, float* %arrayidx, align 4			%0 = load float, float* %arrayidx, align 4
	%add = fadd float %0, %sum.07			%add = fadd float %0, %sum.07
	%iv.next = add nuw nsw i64 %iv, 1			%iv.next = add nuw nsw i64 %iv, 1
	%exitcond.not = icmp eq i64 %iv.next, %n			%exitcond.not = icmp eq i64 %iv.next, %n
	br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0			br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

	for.end:			for.end:
	ret float %add			ret float %add
	}			}


	; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07			; CHECK-VF4: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07
	; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07			; CHECK-VF8: Found an estimated cost of 32 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07
				; CHECK-VF4-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07

	define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {			define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {
	entry:			entry:
	br label %for.body			br label %for.body

	for.body:			for.body:
	%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]			%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
	%sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]			%sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
	Show All 15 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SVE][Analysis] Tune the cost model according to the tune-cpu attribute
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 381174

llvm/lib/Target/AArch64/AArch64Subtarget.h

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/test/Analysis/CostModel/AArch64/sve-gather.ll

llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll

llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll

llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SVE][Analysis] Tune the cost model according to the tune-cpu attribute
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 381174

llvm/lib/Target/AArch64/AArch64Subtarget.h

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/test/Analysis/CostModel/AArch64/sve-gather.ll

llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll

llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll

llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll

[SVE][Analysis] Tune the cost model according to the tune-cpu attribute
ClosedPublic