Diff 446534

llvm/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines
class GlobalValue;		class GlobalValue;
class InstCombiner;		class InstCombiner;
class OptimizationRemarkEmitter;		class OptimizationRemarkEmitter;
class IntrinsicInst;		class IntrinsicInst;
class LoadInst;		class LoadInst;
class LoopAccessInfo;		class LoopAccessInfo;
class Loop;		class Loop;
class LoopInfo;		class LoopInfo;
		class LoopVectorizationLegality;
class ProfileSummaryInfo;		class ProfileSummaryInfo;
class RecurrenceDescriptor;		class RecurrenceDescriptor;
class SCEV;		class SCEV;
class ScalarEvolution;		class ScalarEvolution;
class StoreInst;		class StoreInst;
class SwitchInst;		class SwitchInst;
class TargetLibraryInfo;		class TargetLibraryInfo;
class Type;		class Type;
▲ Show 20 Lines • Show All 462 Lines • ▼ Show 20 Lines	bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC, TargetLibraryInfo *LibInfo,		AssumptionCache &AC, TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) const;		HardwareLoopInfo &HWLoopInfo) const;

/// Query the target whether it would be preferred to create a predicated		/// Query the target whether it would be preferred to create a predicated
/// vector loop, which can avoid the need to emit a scalar epilogue loop.		/// vector loop, which can avoid the need to emit a scalar epilogue loop.
bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,		bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,
AssumptionCache &AC, TargetLibraryInfo *TLI,		AssumptionCache &AC, TargetLibraryInfo *TLI,
DominatorTree *DT,		DominatorTree *DT,
const LoopAccessInfo *LAI) const;		LoopVectorizationLegality *LVL) const;

/// Query the target whether lowering of the llvm.get.active.lane.mask		/// Query the target whether lowering of the llvm.get.active.lane.mask
/// intrinsic is supported and how the mask should be used. A return value		/// intrinsic is supported and how the mask should be used. A return value
/// of PredicationStyle::Data indicates the mask is used as data only,		/// of PredicationStyle::Data indicates the mask is used as data only,
/// whereas PredicationStyle::DataAndControlFlow indicates we should also use		/// whereas PredicationStyle::DataAndControlFlow indicates we should also use
/// the mask for control flow in the loop. If unsupported the return value is		/// the mask for control flow in the loop. If unsupported the return value is
/// PredicationStyle::None.		/// PredicationStyle::None.
PredicationStyle emitGetActiveLaneMask() const;		PredicationStyle emitGetActiveLaneMask() const;
▲ Show 20 Lines • Show All 1,008 Lines • ▼ Show 20 Lines	virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
UnrollingPreferences &UP,		UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) = 0;		OptimizationRemarkEmitter *ORE) = 0;
virtual void getPeelingPreferences(Loop *L, ScalarEvolution &SE,		virtual void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
PeelingPreferences &PP) = 0;		PeelingPreferences &PP) = 0;
virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,		virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,		AssumptionCache &AC,
TargetLibraryInfo *LibInfo,		TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) = 0;		HardwareLoopInfo &HWLoopInfo) = 0;
virtual bool		virtual bool preferPredicateOverEpilogue(Loop L, LoopInfo LI,
preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,		ScalarEvolution &SE,
AssumptionCache &AC, TargetLibraryInfo *TLI,		AssumptionCache &AC,
DominatorTree DT, const LoopAccessInfo LAI) = 0;		TargetLibraryInfo *TLI,
		DominatorTree *DT,
		LoopVectorizationLegality *LVL) = 0;
virtual PredicationStyle emitGetActiveLaneMask() = 0;		virtual PredicationStyle emitGetActiveLaneMask() = 0;
virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,		virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) = 0;		IntrinsicInst &II) = 0;
virtual Optional<Value *>		virtual Optional<Value *>
simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,		simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
APInt DemandedMask, KnownBits &Known,		APInt DemandedMask, KnownBits &Known,
bool &KnownBitsComputed) = 0;		bool &KnownBitsComputed) = 0;
virtual Optional<Value *> simplifyDemandedVectorEltsIntrinsic(		virtual Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
▲ Show 20 Lines • Show All 360 Lines • ▼ Show 20 Lines	public:
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,		bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC, TargetLibraryInfo *LibInfo,		AssumptionCache &AC, TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) override {		HardwareLoopInfo &HWLoopInfo) override {
return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);		return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}		}
bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,		bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,
AssumptionCache &AC, TargetLibraryInfo *TLI,		AssumptionCache &AC, TargetLibraryInfo *TLI,
DominatorTree *DT,		DominatorTree *DT,
const LoopAccessInfo *LAI) override {		LoopVectorizationLegality *LVL) override {
return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);		return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
}		}
PredicationStyle emitGetActiveLaneMask() override {		PredicationStyle emitGetActiveLaneMask() override {
return Impl.emitGetActiveLaneMask();		return Impl.emitGetActiveLaneMask();
}		}
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,		Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) override {		IntrinsicInst &II) override {
return Impl.instCombineIntrinsic(IC, II);		return Impl.instCombineIntrinsic(IC, II);
}		}
▲ Show 20 Lines • Show All 628 Lines • Show Last 20 Lines

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 157 Lines • ▼ Show 20 Lines	bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC, TargetLibraryInfo *LibInfo,		AssumptionCache &AC, TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) const {		HardwareLoopInfo &HWLoopInfo) const {
return false;		return false;
}		}

bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,		bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,
AssumptionCache &AC, TargetLibraryInfo *TLI,		AssumptionCache &AC, TargetLibraryInfo *TLI,
DominatorTree *DT,		DominatorTree *DT,
const LoopAccessInfo *LAI) const {		LoopVectorizationLegality *LVL) const {
return false;		return false;
}		}

PredicationStyle emitGetActiveLaneMask() const {		PredicationStyle emitGetActiveLaneMask() const {
return PredicationStyle::None;		return PredicationStyle::None;
}		}

Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,		Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
▲ Show 20 Lines • Show All 1,098 Lines • Show Last 20 Lines

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 597 Lines • ▼ Show 20 Lines	bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
TargetLibraryInfo *LibInfo,		TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) {		HardwareLoopInfo &HWLoopInfo) {
return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);		return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}		}

bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,		bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,
AssumptionCache &AC, TargetLibraryInfo *TLI,		AssumptionCache &AC, TargetLibraryInfo *TLI,
DominatorTree *DT,		DominatorTree *DT,
const LoopAccessInfo *LAI) {		LoopVectorizationLegality *LVL) {
return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);		return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
}		}

PredicationStyle emitGetActiveLaneMask() {		PredicationStyle emitGetActiveLaneMask() {
return BaseT::emitGetActiveLaneMask();		return BaseT::emitGetActiveLaneMask();
}		}

Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,		Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) {		IntrinsicInst &II) {
▲ Show 20 Lines • Show All 1,747 Lines • Show Last 20 Lines

llvm/lib/Analysis/TargetTransformInfo.cpp

Show First 20 Lines • Show All 288 Lines • ▼ Show 20 Lines	bool TargetTransformInfo::isHardwareLoopProfitable(
Loop *L, ScalarEvolution &SE, AssumptionCache &AC,		Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {		TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);		return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}		}

bool TargetTransformInfo::preferPredicateOverEpilogue(		bool TargetTransformInfo::preferPredicateOverEpilogue(
Loop L, LoopInfo LI, ScalarEvolution &SE, AssumptionCache &AC,		Loop L, LoopInfo LI, ScalarEvolution &SE, AssumptionCache &AC,
TargetLibraryInfo TLI, DominatorTree DT,		TargetLibraryInfo TLI, DominatorTree DT,
const LoopAccessInfo *LAI) const {		LoopVectorizationLegality *LVL) const {
return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);		return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
}		}

PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const {		PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const {
return TTIImpl->emitGetActiveLaneMask();		return TTIImpl->emitGetActiveLaneMask();
}		}

Optional<Instruction *>		Optional<Instruction *>
TargetTransformInfo::instCombineIntrinsic(InstCombiner &IC,		TargetTransformInfo::instCombineIntrinsic(InstCombiner &IC,
▲ Show 20 Lines • Show All 928 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Show First 20 Lines • Show All 334 Lines • ▼ Show 20 Lines	public:
}		}

PredicationStyle emitGetActiveLaneMask() const {		PredicationStyle emitGetActiveLaneMask() const {
if (ST->hasSVE())		if (ST->hasSVE())
return PredicationStyle::DataAndControlFlow;		return PredicationStyle::DataAndControlFlow;
return PredicationStyle::None;		return PredicationStyle::None;
}		}

		/// AArch64 declaration of the preferPredicateOverEpilogue hook. The last
		/// parameter is the LoopVectorizationLegality object for the loop.
		// NOTE(review): `Loop L, LoopInfo LI` presumably lost their `*` declarators
		// when this diff was extracted (isHardwareLoopProfitable takes `Loop *L`)
		// -- confirm against the tree.
		bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,
		AssumptionCache &AC, TargetLibraryInfo *TLI,
		DominatorTree *DT,
		LoopVectorizationLegality *LVL);

bool supportsScalableVectors() const { return ST->hasSVE(); }		bool supportsScalableVectors() const { return ST->hasSVE(); }

bool enableScalableVectorization() const { return ST->hasSVE(); }		bool enableScalableVectorization() const { return ST->hasSVE(); }

bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,		bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
ElementCount VF) const;		ElementCount VF) const;

bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,		bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
Show All 18 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Show All 16 Lines
#include "llvm/CodeGen/CostTable.h"		#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"		#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"		#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"		#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"		#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"		#include "llvm/Transforms/InstCombine/InstCombiner.h"
		#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>		#include <algorithm>
using namespace llvm;		using namespace llvm;
using namespace llvm::PatternMatch;		using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"		#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",		static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
cl::init(true), cl::Hidden);		cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),		static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
cl::Hidden);		cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",		static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
cl::init(10), cl::Hidden);		cl::init(10), cl::Hidden);

		class TailFoldingKind {
		private:
		uint8_t Bits = 0; // Currently defaults to disabled.

		public:
		enum TailFoldingOpts {
		TFDisabled = 0x0,
		paulwalker-armUnsubmitted Done Reply Inline Actions I think you want a couple more cases here: `default` for whatever is used when no option is provided; `all` instead of `enabled`; and `simple` for the styles which currently have no name. I'm not that bothered about the name used; `simple` was just the first thing that popped into my head and isn't particularly great. My thinking is that if the user wants to enable a non-default case they shouldn't need to know what the default is. Likewise you should be able to be explicit when enabling a subset (i.e. you shouldn't need to guess at what needs to be disabled, and so can just start by disabling all and then adding the cases you care about). paulwalker-arm: I think you want a couple more cases here: * `default` for whatever is used when no option is…
		TFReductions = 0x01,
		paulwalker-armUnsubmitted Done Reply Inline Actions This doesn't really scale. What happens when there's another reason to allow the user to control tail predication? You'll need to add `EnabledNoBlob2`, then `EnabledNoReductionOrBlob2`..... Is it worth adding a custom parsing class (assuming something doesn't already exist) so that we can do `-sve-tail-predication` for the default and then allow users to add a comma separated list of <option>s to enable or no-<option> to disable, along with your existing `disabled`/`enabled` options. I'm not asking you to add new reasons to disable, only to make it easier to add them if necessary. That said, with this approach you could split the reduction and first order recurrences. By extension I'm suggesting TailPredication wants to be a bit field like enum with `Disabled=0` and `Enabled=AllOnes`. paulwalker-arm: This doesn't really scale. What happens when there's another reason to allow the user to…
		david-armAuthorUnsubmitted Done Reply Inline Actions Good suggestion @paulwalker-arm - I've had a go at doing this! david-arm: Good suggestion @paulwalker-arm - I've had a go at doing this!
		TFRecurrences = 0x02,
		TFSimple = 0x80,
		TFAll = TFReductions \| TFRecurrences \| TFSimple
		};

		void operator=(const std::string &Val) {
		if (Val.empty())
		return;
		SmallVector<StringRef, 6> TailFoldTypes;
		StringRef(Val).split(TailFoldTypes, '+', -1, false);
		for (auto TailFoldType : TailFoldTypes) {
		if (TailFoldType == "disabled")
		Bits = 0;
		else if (TailFoldType == "all")
		Bits = TFAll;
		else if (TailFoldType == "default")
		Bits = 0; // Currently defaults to never tail-folding.
		else if (TailFoldType == "simple")
		add(TFSimple);
		paulwalker-armUnsubmitted Done Reply Inline Actions Can this be `add(TFSimple)` so it's not position dependent? paulwalker-arm: Can this be `add(TFSimple)` so it's not position dependent?
		else if (TailFoldType == "reductions")
		add(TFReductions);
		else if (TailFoldType == "recurrences")
		add(TFRecurrences);
		else if (TailFoldType == "noreductions")
		remove(TFReductions);
		else if (TailFoldType == "norecurrences")
		remove(TFRecurrences);
		else {
		errs()
		<< "invalid argument " << TailFoldType.str()
		<< " to -sve-tail-folding=; each element must be one of: disabled, "
		"all, default, simple, reductions, noreductions, recurrences, "
		"norecurrences\n";
		}
		}
		}

		operator uint8_t() const { return Bits; }

		void add(uint8_t Flag) { Bits \|= Flag; }
		void remove(uint8_t Flag) { Bits &= ~Flag; }
		};

		// Backing storage for the -sve-tail-folding option. SVETailFolding below is
		// declared with external storage (second template argument `true` plus
		// cl::location), so the parsed string is assigned to this object via
		// TailFoldingKind::operator=(const std::string &).
		TailFoldingKind TailFoldingKindLoc;

		// Command-line control over which loop kinds may use tail-folding on SVE.
		// The value is parsed as a '+'-separated list of elements; the help text
		// lists every element the parser accepts, including the no<kind> forms.
		cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
		"sve-tail-folding",
		cl::desc(
		"Control the use of vectorisation using tail-folding for SVE:"
		"\ndisabled No loop types will vectorize using tail-folding"
		"\ndefault Uses the default tail-folding settings for the target "
		"CPU"
		"\nall All legal loop types will vectorize using tail-folding"
		"\nsimple Use tail-folding for simple loops (not reductions or "
		"recurrences)"
		"\nreductions Use tail-folding for loops containing reductions"
		"\nnoreductions Do not use tail-folding for loops containing "
		"reductions"
		"\nrecurrences Use tail-folding for loops containing first order "
		"recurrences"
		"\nnorecurrences Do not use tail-folding for loops containing first "
		"order recurrences"
		"\nElements may be combined with '+', e.g. all+noreductions"),
		cl::location(TailFoldingKindLoc));

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,		bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {		const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();		const TargetMachine &TM = getTLI()->getTargetMachine();

const FeatureBitset &CallerBits =		const FeatureBitset &CallerBits =
TM.getSubtargetImpl(*Caller)->getFeatureBits();		TM.getSubtargetImpl(*Caller)->getFeatureBits();
const FeatureBitset &CalleeBits =		const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();		TM.getSubtargetImpl(*Callee)->getFeatureBits();
▲ Show 20 Lines • Show All 2,902 Lines • ▼ Show 20 Lines	if (SubLT.second.isVector()) {
int NumSubElts = SubLT.second.getVectorNumElements();		int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)		if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
return SubLT.first;		return SubLT.first;
}		}
}		}

return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);		return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}		}

		bool AArch64TTIImpl::preferPredicateOverEpilogue(
		Loop L, LoopInfo LI, ScalarEvolution &SE, AssumptionCache &AC,
		TargetLibraryInfo TLI, DominatorTree DT, LoopVectorizationLegality *LVL) {
		if (!ST->hasSVE() \|\| TailFoldingKindLoc == TailFoldingKind::TFDisabled)
		return false;

		TailFoldingKind Required; // Defaults to 0.
		if (LVL->getReductionVars().size())
		Required.add(TailFoldingKind::TFReductions);
		if (LVL->getFirstOrderRecurrences().size())
		Required.add(TailFoldingKind::TFRecurrences);
		if (!Required)
		Required.add(TailFoldingKind::TFSimple);

		return (TailFoldingKindLoc & Required) == Required;
		paulwalker-armUnsubmitted Done Reply Inline Actions I don't think this works now we have the expanded bitfield. I think you need logic like: TailFoldingKind Required = 0; if (LVL->getReductionVars().size()) Required.add(TailFoldingKind::TFReductions); if (LVL->getFirstOrderRecurrences().size()) Required.add(TailFoldingKind::TFRecurrences); if (!Required) Required.add(TailFoldingKind::TFSimple); return TailFoldingKindLoc & Required; paulwalker-arm: I don't think this works now we have the expanded bitfield. I think you need logic like: ```…
		david-armAuthorUnsubmitted Done Reply Inline Actions Hmm, the existing code I have does work since the tests I added all pass so I don't believe there is a bug. With the existing version if the user didn't request reductions and the loop contains at least one reduction then `!(TailFoldingKindLoc & TailFoldingKind::TFReductions) && LVL->getReductionVars().size()` is true and so we return false. This is the same as your suggested code because `Required != TailFoldingKindLoc` in that case. We then return true at the end if either: a) the loop is 'simple', or b) the user has explicitly permitted tail-folding with reductions and/or recurrences. Having said that, I'm happy to give your suggestion a try if you think it reads better! david-arm: Hmm, the existing code I have does work since the tests I added all pass so I don't believe…
		paulwalker-armUnsubmitted Done Reply Inline Actions If `TailFoldingKind::TFReductions` is set and the loop contains no reduction then tail folding will be enabled regardless of whether `TailFoldingKind::TFSimple` is set. Which I think is wrong? paulwalker-arm: If `TailFoldingKind::TFReductions` is set and the loop contains no reduction then tail folding…
		david-armAuthorUnsubmitted Done Reply Inline Actions Oh I see what you mean now. In which case this patch is missing a test. :) david-arm: Oh I see what you mean now. In which case this patch is missing a test. :)
		}

llvm/lib/Target/ARM/ARMTargetTransformInfo.h

Show First 20 Lines • Show All 282 Lines • ▼ Show 20 Lines	InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);		TTI::TargetCostKind CostKind);

bool maybeLoweredToCall(Instruction &I);		bool maybeLoweredToCall(Instruction &I);
bool isLoweredToCall(const Function *F);		bool isLoweredToCall(const Function *F);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,		bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,		AssumptionCache &AC,
TargetLibraryInfo *LibInfo,		TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo);		HardwareLoopInfo &HWLoopInfo);
bool preferPredicateOverEpilogue(Loop L, LoopInfo LI,		bool preferPredicateOverEpilogue(Loop L, LoopInfo LI, ScalarEvolution &SE,
ScalarEvolution &SE,		AssumptionCache &AC, TargetLibraryInfo *TLI,
AssumptionCache &AC,
TargetLibraryInfo *TLI,
DominatorTree *DT,		DominatorTree *DT,
const LoopAccessInfo *LAI);		LoopVectorizationLegality *LVL);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,		void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,		TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE);		OptimizationRemarkEmitter *ORE);

PredicationStyle emitGetActiveLaneMask() const;		PredicationStyle emitGetActiveLaneMask() const;

void getPeelingPreferences(Loop *L, ScalarEvolution &SE,		void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);		TTI::PeelingPreferences &PP);
▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Show All 14 Lines
#include "llvm/CodeGen/CostTable.h"		#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"		#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"		#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"		#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"		#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"		#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
		#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"		#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"		#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"		#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"		#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"		#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"		#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"		#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"		#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"		#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"		#include "llvm/Transforms/Utils/LoopUtils.h"
		#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>		#include <algorithm>
#include <cassert>		#include <cassert>
#include <cstdint>		#include <cstdint>
#include <utility>		#include <utility>

using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "armtti"		#define DEBUG_TYPE "armtti"
▲ Show 20 Lines • Show All 2,148 Lines • ▼ Show 20 Lines	for (Instruction &I : BB->instructionsWithoutDebug()) {
}		}
}		}
}		}

LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");		LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
return true;		return true;
}		}

bool ARMTTIImpl::preferPredicateOverEpilogue(Loop L, LoopInfo LI,		bool ARMTTIImpl::preferPredicateOverEpilogue(
ScalarEvolution &SE,		Loop L, LoopInfo LI, ScalarEvolution &SE, AssumptionCache &AC,
AssumptionCache &AC,		TargetLibraryInfo TLI, DominatorTree DT, LoopVectorizationLegality *LVL) {
TargetLibraryInfo *TLI,
DominatorTree *DT,
const LoopAccessInfo *LAI) {
if (!EnableTailPredication) {		if (!EnableTailPredication) {
LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");		LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
return false;		return false;
}		}

// Creating a predicated vector loop is the first step for generating a		// Creating a predicated vector loop is the first step for generating a
// tail-predicated hardware loop, for which we need the MVE masked		// tail-predicated hardware loop, for which we need the MVE masked
// load/stores instructions:		// load/stores instructions:
Show All 25 Lines	bool ARMTTIImpl::preferPredicateOverEpilogue(
}		}

if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT)) {		if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT)) {
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "		LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
"a candidate.\n");		"a candidate.\n");
return false;		return false;
}		}

return canTailPredicateLoop(L, LI, SE, DL, LAI);		return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
}		}

PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {		PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
if (!ST->hasMVEIntegerOps() \|\| !EnableTailPredication)		if (!ST->hasMVEIntegerOps() \|\| !EnableTailPredication)
return PredicationStyle::None;		return PredicationStyle::None;

// Intrinsic @llvm.get.active.lane.mask is supported.		// Intrinsic @llvm.get.active.lane.mask is supported.
// It is used in the MVETailPredication pass, which requires the number of		// It is used in the MVETailPredication pass, which requires the number of
▲ Show 20 Lines • Show All 132 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 9,739 Lines • ▼ Show 20 Lines	static ScalarEpilogueLowering getScalarEpilogueLowering(
switch (Hints.getPredicate()) {		switch (Hints.getPredicate()) {
case LoopVectorizeHints::FK_Enabled:		case LoopVectorizeHints::FK_Enabled:
return CM_ScalarEpilogueNotNeededUsePredicate;		return CM_ScalarEpilogueNotNeededUsePredicate;
case LoopVectorizeHints::FK_Disabled:		case LoopVectorizeHints::FK_Disabled:
return CM_ScalarEpilogueAllowed;		return CM_ScalarEpilogueAllowed;
};		};

// 4) if the TTI hook indicates this is profitable, request predication.		// 4) if the TTI hook indicates this is profitable, request predication.
if (TTI->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT,		if (TTI->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, &LVL))
LVL.getLAI()))
return CM_ScalarEpilogueNotNeededUsePredicate;		return CM_ScalarEpilogueNotNeededUsePredicate;

return CM_ScalarEpilogueAllowed;		return CM_ScalarEpilogueAllowed;
}		}

Value VPTransformState::get(VPValue Def, unsigned Part) {		Value VPTransformState::get(VPValue Def, unsigned Part) {
// If Values have been set for this Def return the one relevant for \p Part.		// If Values have been set for this Def return the one relevant for \p Part.
if (hasVectorValue(Def, Part))		if (hasVectorValue(Def, Part))
▲ Show 20 Lines • Show All 802 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll

This file was added.

				; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
				; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
				; RUN: opt < %s -loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
				paulwalker-armUnsubmitted Done Reply Inline Actions Is it worth having a version of this written as `-sve-tail-folding=disabled+simple+reductions+recurrences`, where `simple` is just whatever you decide to call the set which have no name. I don't think we need all combinations but having one line which enables everything manually will ensure we've no holes or typos. If you agree with adding a `default` option then we should test that as well because those are the check lines that will change as we support more cases. paulwalker-arm: Is it worth having a version of this written as `-sve-tail…
				; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
				; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
				; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
				; RUN: opt < %s -loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED

				target triple = "aarch64-unknown-linux-gnu"

				define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
				; CHECK-NOTF-LABEL: @simple_memset(
				; CHECK-NOTF: vector.ph:
				; CHECK-NOTF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
				; CHECK-NOTF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
				; CHECK-NOTF: vector.body:
				; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
				; CHECK-NOTF: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*

				; CHECK-TF-NORED-LABEL: @simple_memset(
				; CHECK-TF-NORED: vector.ph:
				; CHECK-TF-NORED: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
				; CHECK-TF-NORED: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
				; CHECK-TF-NORED: vector.body:
				; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
				; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]

				; CHECK-TF-NOREC-LABEL: @simple_memset(
				; CHECK-TF-NOREC: vector.ph:
				; CHECK-TF-NOREC: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
				; CHECK-TF-NOREC: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
				; CHECK-TF-NOREC: vector.body:
				; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
				; CHECK-TF-NOREC: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]

				; CHECK-TF-LABEL: @simple_memset(
				; CHECK-TF: vector.ph:
				; CHECK-TF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
				; CHECK-TF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
				; CHECK-TF: vector.body:
				; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
				; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]

				; CHECK-TF-ONLYRED-LABEL: @simple_memset(
				; CHECK-TF-ONLYRED: vector.ph:
				; CHECK-TF-ONLYRED: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
				; CHECK-TF-ONLYRED: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
				; CHECK-TF-ONLYRED: vector.body:
				; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
				; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*

				entry:
				br label %while.body

				while.body: ; preds = %while.body, %entry
				%index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
				%gep = getelementptr i32, i32* %ptr, i64 %index
				store i32 %val, i32* %gep
				%index.next = add nsw i64 %index, 1
				%cmp10 = icmp ult i64 %index.next, %n
				br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

				while.end.loopexit: ; preds = %while.body
				ret void
				}

				define float @fadd_red_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
				; CHECK-NOTF-LABEL: @fadd_red_fast
				; CHECK-NOTF: vector.body:
				; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
				; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 4 x float>
				; CHECK-NOTF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
				; CHECK-NOTF: middle.block:
				; CHECK-NOTF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])

				; CHECK-TF-NORED-LABEL: @fadd_red_fast
				; CHECK-TF-NORED: vector.body:
				; CHECK-TF-NORED-NOT: %{{.*}} = phi <vscale x 4 x i1>
				; CHECK-TF-NORED: %[[LOAD:.*]] = load <vscale x 4 x float>
				; CHECK-TF-NORED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
				; CHECK-TF-NORED: middle.block:
				; CHECK-TF-NORED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])

				; CHECK-TF-NOREC-LABEL: @fadd_red_fast
				; CHECK-TF-NOREC: vector.body:
				; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
				; CHECK-TF-NOREC: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
				; CHECK-TF-NOREC: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
				; CHECK-TF-NOREC: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
				; CHECK-TF-NOREC: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
				; CHECK-TF-NOREC: middle.block:
				; CHECK-TF-NOREC-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])

				; CHECK-TF-LABEL: @fadd_red_fast
				; CHECK-TF: vector.body:
				; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
				; CHECK-TF: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
				; CHECK-TF: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
				; CHECK-TF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
				; CHECK-TF: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
				; CHECK-TF: middle.block:
				; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])

				; CHECK-TF-ONLYRED-LABEL: @fadd_red_fast
				; CHECK-TF-ONLYRED: vector.body:
				; CHECK-TF-ONLYRED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
				; CHECK-TF-ONLYRED: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
				; CHECK-TF-ONLYRED: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
				; CHECK-TF-ONLYRED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
				; CHECK-TF-ONLYRED: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
				; CHECK-TF-ONLYRED: middle.block:
				; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
				entry:
				br label %for.body

				for.body:
				%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
				%sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
				%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
				%0 = load float, float* %arrayidx, align 4
				%add = fadd fast float %0, %sum.07
				%iv.next = add nuw nsw i64 %iv, 1
				%exitcond.not = icmp eq i64 %iv.next, %n
				br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

				for.end:
				ret float %add
				}

				define void @add_recur(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
				; CHECK-NOTF-LABEL: @add_recur
				; CHECK-NOTF: entry:
				; CHECK-NOTF: %[[PRE:.*]] = load i32, i32* %src, align 4
				; CHECK-NOTF: vector.ph:
				; CHECK-NOTF: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
				; CHECK-NOTF: vector.body:
				; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
				; CHECK-NOTF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
				; CHECK-NOTF: %[[LOAD]] = load <vscale x 4 x i32>
				; CHECK-NOTF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
				; CHECK-NOTF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
				; CHECK-NOTF: store <vscale x 4 x i32> %[[ADD]]

				; CHECK-TF-NORED-LABEL: @add_recur
				; CHECK-TF-NORED: entry:
				; CHECK-TF-NORED: %[[PRE:.*]] = load i32, i32* %src, align 4
				; CHECK-TF-NORED: vector.ph:
				; CHECK-TF-NORED: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
				; CHECK-TF-NORED: vector.body:
				; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
				; CHECK-TF-NORED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
				; CHECK-TF-NORED: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>{{.*}} %[[ACTIVE_LANE_MASK]]
				; CHECK-TF-NORED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
				; CHECK-TF-NORED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
				; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])

				; CHECK-TF-NOREC-LABEL: @add_recur
				; CHECK-TF-NOREC: entry:
				; CHECK-TF-NOREC: %[[PRE:.*]] = load i32, i32* %src, align 4
				; CHECK-TF-NOREC: vector.ph:
				; CHECK-TF-NOREC: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
				; CHECK-TF-NOREC: vector.body:
				; CHECK-TF-NOREC-NOT: %{{.*}} = phi <vscale x 4 x i1>
				; CHECK-TF-NOREC: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
				; CHECK-TF-NOREC: %[[LOAD]] = load <vscale x 4 x i32>
				; CHECK-TF-NOREC: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
				; CHECK-TF-NOREC: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
				; CHECK-TF-NOREC: store <vscale x 4 x i32> %[[ADD]]

				; CHECK-TF-LABEL: @add_recur
				; CHECK-TF: entry:
				; CHECK-TF: %[[PRE:.*]] = load i32, i32* %src, align 4
				; CHECK-TF: vector.ph:
				; CHECK-TF: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
				; CHECK-TF: vector.body:
				; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
				; CHECK-TF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
				; CHECK-TF: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>{{.*}} %[[ACTIVE_LANE_MASK]]
				; CHECK-TF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
				; CHECK-TF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
				; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])

				; CHECK-TF-ONLYRED-LABEL: @add_recur
				; CHECK-TF-ONLYRED: entry:
				; CHECK-TF-ONLYRED: %[[PRE:.*]] = load i32, i32* %src, align 4
				; CHECK-TF-ONLYRED: vector.ph:
				; CHECK-TF-ONLYRED: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
				; CHECK-TF-ONLYRED: vector.body:
				; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
				; CHECK-TF-ONLYRED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
				; CHECK-TF-ONLYRED: %[[LOAD]] = load <vscale x 4 x i32>
				; CHECK-TF-ONLYRED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
				; CHECK-TF-ONLYRED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
				; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[ADD]]

				entry:
				%.pre = load i32, i32* %src, align 4
				br label %for.body

				for.body: ; preds = %entry, %for.body
				%0 = phi i32 [ %1, %for.body ], [ %.pre, %entry ]
				%i.010 = phi i64 [ %add, %for.body ], [ 0, %entry ]
				%add = add nuw nsw i64 %i.010, 1
				%arrayidx1 = getelementptr inbounds i32, i32* %src, i64 %add
				%1 = load i32, i32* %arrayidx1, align 4
				%add2 = add nsw i32 %1, %0
				%arrayidx3 = getelementptr inbounds i32, i32* %dst, i64 %i.010
				store i32 %add2, i32* %arrayidx3, align 4
				%exitcond.not = icmp eq i64 %add, %n
				br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

				for.end: ; preds = %for.body
				ret void
				}

				attributes #0 = { "target-features"="+sve" }

				!0 = distinct !{!0, !1, !2, !3, !4}
				!1 = !{!"llvm.loop.vectorize.width", i32 4}
				!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
				!3 = !{!"llvm.loop.interleave.count", i32 1}
				!4 = !{!"llvm.loop.vectorize.enable", i1 true}

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Add target hook for preferPredicateOverEpilogue
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 446534

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/lib/Target/ARM/ARMTargetTransformInfo.h

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Add target hook for preferPredicateOverEpilogue
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 446534

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/lib/Target/ARM/ARMTargetTransformInfo.h

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll

[AArch64] Add target hook for preferPredicateOverEpilogue
ClosedPublic