Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -32,13 +32,38 @@
 static cl::opt<unsigned> UnrollThresholdPrivate(
   "amdgpu-unroll-threshold-private",
   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
-  cl::init(2000), cl::Hidden);
+  cl::init(2500), cl::Hidden);
 
 static cl::opt<unsigned> UnrollThresholdLocal(
   "amdgpu-unroll-threshold-local",
   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
   cl::init(1000), cl::Hidden);
 
+static cl::opt<unsigned> UnrollThresholdIf(
+  "amdgpu-unroll-threshold-if",
+  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
+  cl::init(150), cl::Hidden);
+
+// Return true if Cond is an instruction in loop L whose operands, followed
+// recursively up to a fixed depth, include a PHI node that is not contained
+// in any of L's sub-loops.
+static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
+                              unsigned Depth = 0) {
+  if (const Instruction *I = dyn_cast<Instruction>(Cond)) {
+    if (!L->contains(I))
+      return false;
+    for (const Value *V : I->operand_values()) {
+      if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
+        if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
+                  return SubLoop->contains(PHI); }))
+          return true;
+      } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
+        return true;
+    }
+  }
+  return false;
+}
+
 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                             TTI::UnrollingPreferences &UP) {
   UP.Threshold = 300; // Twice the default.
@@ -57,7 +82,33 @@
     const DataLayout &DL = BB->getModule()->getDataLayout();
     unsigned LocalGEPsSeen = 0;
 
+    if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
+               return SubLoop->contains(BB); }))
+      continue; // Block belongs to an inner loop.
+
     for (const Instruction &I : *BB) {
+
+      // Unroll a loop which contains an "if" statement whose condition is
+      // defined by a PHI belonging to the loop. This may help to eliminate
+      // the if region and potentially even the PHI itself, saving on both
+      // divergence and registers used for the PHI.
+      // Add a small bonus for each such "if" statement.
+      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
+        if (UP.Threshold < MaxBoost && Br->isConditional()) {
+          if (L->isLoopExiting(Br->getSuccessor(0)) ||
+              L->isLoopExiting(Br->getSuccessor(1)))
+            continue;
+          if (dependsOnLocalPhi(L, Br->getCondition())) {
+            UP.Threshold += UnrollThresholdIf;
+            DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+                         << " for loop:\n" << *L << " due to " << *Br << '\n');
+            if (UP.Threshold >= MaxBoost)
+              return;
+          }
+        }
+        continue;
+      }
+
       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
       if (!GEP)
         continue;
@@ -128,7 +179,7 @@
         UP.Threshold = Threshold;
         DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
                      << *L << " due to " << *GEP << '\n');
-        if (UP.Threshold == MaxBoost)
+        if (UP.Threshold >= MaxBoost)
           return;
       }
     }
Index: test/CodeGen/AMDGPU/unroll.ll
===================================================================
--- test/CodeGen/AMDGPU/unroll.ll
+++ test/CodeGen/AMDGPU/unroll.ll
@@ -64,3 +64,36 @@
 exit:
   ret void
 }
+
+; Check that a loop with if inside completely unrolled to eliminate phi and if
+
+; CHECK: entry:
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: store
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: store
+; CHECK-NOT: br
+define amdgpu_kernel void @unroll_for_if(i32* %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i1 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %and = and i32 %i1, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %0 = sext i32 %i1 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %0
+  store i32 0, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i1, 1
+  %cmp = icmp ult i32 %inc, 48
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}