Diff 224612

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {

friend BaseT;		friend BaseT;

Triple TargetTriple;		Triple TargetTriple;

const TargetSubtargetInfo *ST;		const TargetSubtargetInfo *ST;
const TargetLoweringBase *TLI;		const TargetLoweringBase *TLI;

		AMDGPUSubtarget::Generation Gen;
		arsenmUnsubmitted Done Reply Inline Actions You don't need to add this field. You already have the subtarget available here, you just need to change the type arsenm: You don't need to add this field. You already have the subtarget available here, you just need…
		timcorringhamAuthorUnsubmitted Done Reply Inline Actions Good point. I didn't pay enough attention when I resolved the merge of my code - fixed. timcorringham: Good point. I didn't pay enough attention when I resolved the merge of my code - fixed.

const TargetSubtargetInfo *getST() const { return ST; }		const TargetSubtargetInfo *getST() const { return ST; }
const TargetLoweringBase *getTLI() const { return TLI; }		const TargetLoweringBase *getTLI() const { return TLI; }

public:		public:
explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)		explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),		: BaseT(TM, F.getParent()->getDataLayout()),
TargetTriple(TM->getTargetTriple()),		TargetTriple(TM->getTargetTriple()),
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),		ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()) {}		TLI(ST->getTargetLowering()),
		Gen(TM->getSubtarget<GCNSubtarget>(F).getGeneration()) {}

void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,		void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);		TTI::UnrollingPreferences &UP);
};		};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {		class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
using BaseT = BasicTTIImplBase<GCNTTIImpl>;		using BaseT = BasicTTIImplBase<GCNTTIImpl>;
using TTI = TargetTransformInfo;		using TTI = TargetTransformInfo;
▲ Show 20 Lines • Show All 193 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines

#define DEBUG_TYPE "AMDGPUtti"		#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(		static cl::opt<unsigned> UnrollThresholdPrivate(
"amdgpu-unroll-threshold-private",		"amdgpu-unroll-threshold-private",
cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),		cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
cl::init(2500), cl::Hidden);		cl::init(2500), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(		static cl::opt<unsigned> UnrollThresholdLocal(
		rampitecUnsubmitted Done Reply Inline Actions This change penalizes loops which should have unroll boosted instead. Your new default thresholds are now higher than boosted. rampitec: This change penalizes loops which should have unroll boosted instead. Your new default…
		nhaehnleUnsubmitted Not Done Reply Inline Actions I see no change here. Is something weird going on with the diff? nhaehnle: I see no change here. Is something weird going on with the diff?
		timcorringhamAuthorUnsubmitted Done Reply Inline Actions I now initialise ThresholdLocal to be the max of UnrollThresholdLocal and UP.Threshold, so the value used will only be increased for PAL. timcorringham: I now initialise ThresholdLocal to be the max of UnrollThresholdLocal and UP.Threshold, so the…
		rampitecUnsubmitted Not Done Reply Inline Actions It still does not make sense. You are initializing general threshold higher (1100) than boosted (1000). rampitec: It still does not make sense. You are initializing general threshold higher (1100) than boosted…
"amdgpu-unroll-threshold-local",		"amdgpu-unroll-threshold-local",
cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),		cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
cl::init(1000), cl::Hidden);		cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(		static cl::opt<unsigned> UnrollThresholdIf(
"amdgpu-unroll-threshold-if",		"amdgpu-unroll-threshold-if",
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),		cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(150), cl::Hidden);		cl::init(150), cl::Hidden);
Show All 14 Lines	for (const Value *V : I->operand_values()) {
} else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))		} else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
return true;		return true;
}		}
return false;		return false;
}		}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,		void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {		TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.		UP.Threshold = 300; // Twice the default.
		arsenmUnsubmitted Not Done Reply Inline Actions This would now be dead arsenm: This would now be dead
		timcorringhamAuthorUnsubmitted Done Reply Inline Actions This value is still used if the OS isn't PAL. timcorringham: This value is still used if the OS isn't PAL.
UP.MaxCount = std::numeric_limits<unsigned>::max();		UP.MaxCount = std::numeric_limits<unsigned>::max();
UP.Partial = true;		UP.Partial = true;

// TODO: Do we want runtime unrolling?		// TODO: Do we want runtime unrolling?

		// Set more aggressive defaults for PAL shaders
		if (TargetTriple.getOS() == Triple::AMDPAL) {
		arsenmUnsubmitted Not Done Reply Inline Actions These should probably be the same for all OSes arsenm: These should probably be the same for all OSes
		timcorringhamAuthorUnsubmitted Done Reply Inline Actions That is quite possible, but I don't have tests to confirm that for all OSes. Since the effect of crossing a cliff-edge can be significant (good or bad) I don't want to risk making that change for OSes without performance figures to justify it. timcorringham: That is quite possible, but I don't have tests to confirm that for all OSes. Since the effect…
		UP.MaxPercentThresholdBoost = 1000;
		// and even more aggressive for GFX10
		if (Gen >= AMDGPUSubtarget::GFX10) {
		UP.Threshold = 1100;
		UP.PartialThreshold = 1100;
		} else {
		UP.Threshold = 700;
		UP.PartialThreshold = 700;
		}
		}

// Maximum alloca size that can fit in registers. Reserve 16 registers.		// Maximum alloca size that can fit in registers. Reserve 16 registers.
const unsigned MaxAlloca = (256 - 16) * 4;		const unsigned MaxAlloca = (256 - 16) * 4;
unsigned ThresholdPrivate = UnrollThresholdPrivate;		unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;		unsigned ThresholdLocal = UnrollThresholdLocal;
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);		unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
for (const BasicBlock *BB : L->getBlocks()) {		for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();		const DataLayout &DL = BB->getModule()->getDataLayout();
unsigned LocalGEPsSeen = 0;		unsigned LocalGEPsSeen = 0;
▲ Show 20 Lines • Show All 691 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Amend target loop unroll defaults
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 224612

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Amend target loop unroll defaults
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 224612

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

[AMDGPU] Amend target loop unroll defaults
ClosedPublic