Diff 230399

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines

#define DEBUG_TYPE "AMDGPUtti"		#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(		static cl::opt<unsigned> UnrollThresholdPrivate(
"amdgpu-unroll-threshold-private",		"amdgpu-unroll-threshold-private",
cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),		cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
cl::init(2700), cl::Hidden);		cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(		static cl::opt<unsigned> UnrollThresholdLocal(
		rampitecUnsubmitted Done Reply Inline Actions This change penalizes loops which should have unroll boosted instead. Your new default thresholds are now higher than boosted. rampitec: This change penalizes loops which should have unroll boosted instead. Your new default…
		nhaehnleUnsubmitted Not Done Reply Inline Actions I see no change here. Is something weird going on with the diff? nhaehnle: I see no change here. Is something weird going on with the diff?
		timcorringhamAuthorUnsubmitted Done Reply Inline Actions I now initialise ThresholdLocal to be the max of UnrollThresholdLocal and UP.Threshold, so the value used will only be increased for PAL. timcorringham: I now initialise ThresholdLocal to be the max of UnrollThresholdLocal and UP.Threshold, so the…
		rampitecUnsubmitted Not Done Reply Inline Actions It still does not make sense. You are initializing general threshold higher (1100) than boosted (1000). rampitec: It still does not make sense. You are initializing general threshold higher (1100) than boosted…
"amdgpu-unroll-threshold-local",		"amdgpu-unroll-threshold-local",
cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),		cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
cl::init(1000), cl::Hidden);		cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(		static cl::opt<unsigned> UnrollThresholdIf(
"amdgpu-unroll-threshold-if",		"amdgpu-unroll-threshold-if",
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),		cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(150), cl::Hidden);		cl::init(150), cl::Hidden);
Show All 14 Lines	for (const Value *V : I->operand_values()) {
} else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))		} else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
return true;		return true;
}		}
return false;		return false;
}		}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,		void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {		TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.		const Function &F = *L->getHeader()->getParent();
		UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
		arsenmUnsubmitted Not Done Reply Inline Actions This would now be dead arsenm: This would now be dead
		timcorringhamAuthorUnsubmitted Done Reply Inline Actions This value is still used if the OS isn't PAL. timcorringham: This value is still used if the OS isn't PAL.
UP.MaxCount = std::numeric_limits<unsigned>::max();		UP.MaxCount = std::numeric_limits<unsigned>::max();
UP.Partial = true;		UP.Partial = true;

// TODO: Do we want runtime unrolling?		// TODO: Do we want runtime unrolling?

// Maximum alloca size than can fit registers. Reserve 16 registers.		// Maximum alloca size than can fit registers. Reserve 16 registers.
const unsigned MaxAlloca = (256 - 16) * 4;		const unsigned MaxAlloca = (256 - 16) * 4;
		arsenmUnsubmitted Not Done Reply Inline Actions These should probably be the same for all OSes arsenm: These should probably be the same for all OSes
		timcorringhamAuthorUnsubmitted Done Reply Inline Actions That is quite possible, but I don't have tests to confirm that for all OSes. Since the effect of crossing a cliff-edge can be significant (good or bad) I don't want to risk making that change for OSes without performance figures to justify it. timcorringham: That is quite possible, but I don't have tests to confirm that for all OSes. Since the effect…
unsigned ThresholdPrivate = UnrollThresholdPrivate;		unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;		unsigned ThresholdLocal = UnrollThresholdLocal;
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);		unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
for (const BasicBlock *BB : L->getBlocks()) {		for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();		const DataLayout &DL = BB->getModule()->getDataLayout();
unsigned LocalGEPsSeen = 0;		unsigned LocalGEPsSeen = 0;

if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {		if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
▲ Show 20 Lines • Show All 805 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/amdgpu-unroll-threshold.ll

This file was added.

				; RUN: opt < %s -S -mtriple=amdgcn-- -basicaa -loop-unroll | FileCheck %s

				; Check that the loop in unroll_default is not fully unrolled using the default
				; unroll threshold
				; CHECK-LABEL: @unroll_default
				; CHECK: entry:
				; CHECK: br i1 %cmp
				; CHECK: ret void

				; Check that the same loop in unroll_full is fully unrolled when the default
				; unroll threshold is increased by use of the amdgpu-unroll-threshold attribute
				; CHECK-LABEL: @unroll_full
				; CHECK: entry:
				; CHECK-NOT: br i1 %cmp
				; CHECK: ret void

				@in = internal unnamed_addr global i32* null, align 8
				@out = internal unnamed_addr global i32* null, align 8

				; Test function with no attribute group attached. It contains a single
				; do-while style loop: %i.0 starts at 0, is incremented by 1 each iteration,
				; and the back edge is taken while the incremented value is less than 100.
				; Each iteration loads the 64-bit contents of @in and stores them to @out.
				define void @unroll_default() {
				entry:
				br label %do.body

				do.body: ; preds = %do.body, %entry
				%i.0 = phi i32 [ 0, %entry ], [ %inc, %do.body ]
				%v1 = load i64, i64* bitcast (i32** @in to i64*), align 8
				store i64 %v1, i64* bitcast (i32** @out to i64*), align 8
				%inc = add nsw i32 %i.0, 1
				%cmp = icmp slt i32 %inc, 100
				br i1 %cmp, label %do.body, label %do.end

				do.end: ; preds = %do.body
				ret void
				}

				; Test function carrying attribute group #0, which sets the string attribute
				; "amdgpu-unroll-threshold"="1000". The loop itself starts %i.0 at 0,
				; increments it by 1 each iteration, and takes the back edge while the
				; incremented value is less than 100; each iteration loads the 64-bit
				; contents of @in and stores them to @out.
				define void @unroll_full() #0 {
				entry:
				br label %do.body

				do.body: ; preds = %do.body, %entry
				%i.0 = phi i32 [ 0, %entry ], [ %inc, %do.body ]
				%v1 = load i64, i64* bitcast (i32** @in to i64*), align 8
				store i64 %v1, i64* bitcast (i32** @out to i64*), align 8
				%inc = add nsw i32 %i.0, 1
				%cmp = icmp slt i32 %inc, 100
				br i1 %cmp, label %do.body, label %do.end

				do.end: ; preds = %do.body
				ret void
				}

				attributes #0 = { "amdgpu-unroll-threshold"="1000" }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Amend target loop unroll defaults
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 230399

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

llvm/test/CodeGen/AMDGPU/amdgpu-unroll-threshold.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Amend target loop unroll defaults
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 230399

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

llvm/test/CodeGen/AMDGPU/amdgpu-unroll-threshold.ll

[AMDGPU] Amend target loop unroll defaults
ClosedPublic