diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp --- a/llvm/lib/Analysis/UniformityAnalysis.cpp +++ b/llvm/lib/Analysis/UniformityAnalysis.cpp @@ -32,12 +32,10 @@ template <> void llvm::GenericUniformityAnalysisImpl::initialize() { for (auto &I : instructions(F)) { - if (TTI->isSourceOfDivergence(&I)) { - assert(!I.isTerminator()); + if (TTI->isSourceOfDivergence(&I)) markDivergent(I); - } else if (TTI->isAlwaysUniform(&I)) { + else if (TTI->isAlwaysUniform(&I)) addUniformOverride(I); - } } for (auto &Arg : F.args()) { if (TTI->isSourceOfDivergence(&Arg)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -17,7 +17,7 @@ #include "SIModeRegisterDefaults.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Dominators.h" @@ -73,7 +73,7 @@ const GCNSubtarget *ST = nullptr; AssumptionCache *AC = nullptr; DominatorTree *DT = nullptr; - LegacyDivergenceAnalysis *DA = nullptr; + UniformityInfo *UA = nullptr; Module *Mod = nullptr; const DataLayout *DL = nullptr; bool HasUnsafeFPMath = false; @@ -224,7 +224,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); + AU.addRequired(); // FIXME: Division expansion needs to preserve the dominator tree. if (!ExpandDiv64InIR) @@ -314,7 +314,7 @@ int TySize = DL.getTypeSizeInBits(Ty); Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty); - return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I); + return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I); } bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { @@ -519,7 +519,7 @@ return false; // Prefer scalar if this could be s_mul_i32 - if (DA->isUniform(&I)) + if (UA->isUniform(&I)) return false; Value *LHS = I.getOperand(0); @@ -1237,7 +1237,7 @@ return true; if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - DA->isUniform(&I) && promoteUniformOpToI32(I)) + UA->isUniform(&I) && promoteUniformOpToI32(I)) return true; if (UseMul24Intrin && replaceMulWithMul24(I)) @@ -1367,7 +1367,7 @@ bool Changed = false; if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && - DA->isUniform(&I)) + UA->isUniform(&I)) Changed |= promoteUniformOpToI32(I); return Changed; @@ -1377,7 +1377,7 @@ bool Changed = false; if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - DA->isUniform(&I)) + UA->isUniform(&I)) Changed |= promoteUniformOpToI32(I); return Changed; @@ -1396,7 +1396,7 @@ bool Changed = false; if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - DA->isUniform(&I)) + UA->isUniform(&I)) Changed |= promoteUniformBitreverseToI32(I); return Changed; @@ -1419,7 +1419,7 @@ const AMDGPUTargetMachine &TM = TPC->getTM(); ST = &TM.getSubtarget(F); AC = &getAnalysis().getAssumptionCache(F); - DA = &getAnalysis(); + UA = &getAnalysis().getUniformityInfo(); auto *DTWP = getAnalysisIfAvailable(); DT = DTWP ? &DTWP->getDomTree() : nullptr; @@ -1459,7 +1459,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -200,11 +200,11 @@ ; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction ; GCN-O1-NEXT: SROA -; GCN-O1-NEXT: Post-Dominator Tree Construction -; GCN-O1-NEXT: Natural Loop Information -; GCN-O1-NEXT: Legacy Divergence Analysis +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Canonicalize natural loops ; GCN-O1-NEXT: Scalar Evolution Analysis ; GCN-O1-NEXT: Loop Pass Manager @@ -494,8 +494,8 @@ ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis ; GCN-O1-OPTS-NEXT: Nary reassociation ; GCN-O1-OPTS-NEXT: Early CSE -; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Legacy Divergence Analysis +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU IR optimizations ; GCN-O1-OPTS-NEXT: Canonicalize natural loops ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis @@ -801,8 +801,8 @@ ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Nary reassociation ; GCN-O2-NEXT: Early CSE -; GCN-O2-NEXT: Post-Dominator Tree Construction -; GCN-O2-NEXT: Legacy Divergence Analysis +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU IR optimizations ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: Scalar Evolution Analysis @@ -1114,8 +1114,8 @@ ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Nary reassociation ; GCN-O3-NEXT: Early CSE -; GCN-O3-NEXT: Post-Dominator Tree Construction -; GCN-O3-NEXT: Legacy Divergence Analysis +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU IR optimizations ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Canonicalize natural loops