diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -46,7 +46,7 @@
 
   Triple TargetTriple;
 
-  const TargetSubtargetInfo *ST;
+  const GCNSubtarget *ST;
   const TargetLoweringBase *TLI;
 
   const TargetSubtargetInfo *getST() const { return ST; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -96,10 +96,25 @@
 
   // TODO: Do we want runtime unrolling?
 
+  // Set more aggressive unrolling defaults for PAL shaders.
+  if (TargetTriple.getOS() == Triple::AMDPAL) {
+    UP.MaxPercentThresholdBoost = 1000;
+    // Use even higher thresholds on GFX10 and newer generations.
+    if (ST->getGeneration() >= AMDGPUSubtarget::GFX10) {
+      UP.Threshold = 1100;
+      UP.PartialThreshold = 1100;
+    } else {
+      UP.Threshold = 700;
+      UP.PartialThreshold = 700;
+    }
+  }
+
   // Maximum alloca size than can fit registers. Reserve 16 registers.
   const unsigned MaxAlloca = (256 - 16) * 4;
   unsigned ThresholdPrivate = UnrollThresholdPrivate;
-  unsigned ThresholdLocal = UnrollThresholdLocal;
+  // Never let the local threshold drop below the current UP.Threshold.
+  unsigned ThresholdLocal =
+      std::max(static_cast<unsigned>(UnrollThresholdLocal), UP.Threshold);
   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getModule()->getDataLayout();