Index: llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
@@ -73,9 +73,12 @@
   /// operands
   bool isAlwaysUniform(const Value &Val) const;

-  /// \brief Whether \p Val is a divergent value
+  /// \brief Whether \p Val is divergent at its definition.
   bool isDivergent(const Value &Val) const;

+  /// \brief Whether \p U is divergent. Uses of a uniform value can be divergent.
+  bool isDivergentUse(const Use &U) const;
+
   void print(raw_ostream &OS, const Module *) const;

 private:
@@ -189,12 +192,19 @@
   /// The GPU kernel this analysis result is for
   const Function &getFunction() const { return DA.getFunction(); }

-  /// Whether \p V is divergent.
+  /// Whether \p V is divergent at its definition.
   bool isDivergent(const Value &V) const;

-  /// Whether \p V is uniform/non-divergent
+  /// Whether \p U is divergent. Uses of a uniform value can be divergent.
+  bool isDivergentUse(const Use &U) const;
+
+  /// Whether \p V is uniform/non-divergent.
   bool isUniform(const Value &V) const { return !isDivergent(V); }

+  /// Whether \p U is uniform/non-divergent. Uses of a uniform value can be
+  /// divergent.
+  bool isUniformUse(const Use &U) const { return !isDivergentUse(U); }
+
   /// Print all divergent values in the kernel.
   void print(raw_ostream &OS, const Module *) const;
 };
Index: llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
@@ -39,17 +39,18 @@
   void print(raw_ostream &OS, const Module *) const override;

   // Returns true if V is divergent at its definition.
-  //
-  // Even if this function returns false, V may still be divergent when used
-  // in a different basic block.
   bool isDivergent(const Value *V) const;

+  // Returns true if U is divergent. Uses of a uniform value can be divergent.
+  bool isDivergentUse(const Use *U) const;
+
   // Returns true if V is uniform/non-divergent.
-  //
-  // Even if this function returns true, V may still be divergent when used
-  // in a different basic block.
   bool isUniform(const Value *V) const { return !isDivergent(V); }

+  // Returns true if U is uniform/non-divergent. Uses of a uniform value can be
+  // divergent.
+  bool isUniformUse(const Use *U) const { return !isDivergentUse(U); }
+
   // Keep the analysis results uptodate by removing an erased value.
   void removeValue(const Value *V) { DivergentValues.erase(V); }

@@ -62,6 +63,9 @@

   // Stores all divergent values.
   DenseSet<const Value *> DivergentValues;
+
+  // Stores divergent uses of possibly uniform values.
+  DenseSet<const Use *> DivergentUses;
 };
 } // End llvm namespace
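Note on the intended calling convention of the new API (this sketch is not part of the change; the helper name and the assumption that a LegacyDivergenceAnalysis result is already available are hypothetical): a value that is uniform at its definition can still have divergent uses, typically when the use sits past a loop whose exit condition is divergent, so clients interested in a specific operand should prefer the use-based query.

// Minimal sketch, assuming an llvm::Instruction `I` and an already-computed
// LegacyDivergenceAnalysis `DA` (e.g. obtained via getAnalysis<> in a pass).
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/IR/Instruction.h"

// True only if every operand of I is uniform at this particular use site.
// Iterating operands with DA.isUniform(I.getOperand(Idx)) instead would miss
// uses that are temporally divergent even though the value itself is uniform.
static bool allOperandUsesUniform(const llvm::Instruction &I,
                                  const llvm::LegacyDivergenceAnalysis &DA) {
  for (const llvm::Use &U : I.operands())
    if (DA.isDivergentUse(&U))
      return false;
  return true;
}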
Index: llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp
+++ llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp
@@ -412,6 +412,12 @@
   return DivergentValues.find(&V) != DivergentValues.end();
 }

+bool DivergenceAnalysis::isDivergentUse(const Use &U) const {
+  Value &V = *U.get();
+  Instruction &I = *cast<Instruction>(U.getUser());
+  return isDivergent(V) || isTemporalDivergent(*I.getParent(), V);
+}
+
 void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
   if (DivergentValues.empty())
     return;
@@ -449,6 +455,10 @@
   return DA.isDivergent(val);
 }

+bool GPUDivergenceAnalysis::isDivergentUse(const Use &use) const {
+  return DA.isDivergentUse(use);
+}
+
 void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
   OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
   DA.print(OS, mod);
Index: llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -93,8 +93,9 @@
 class DivergencePropagator {
 public:
   DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
-                       PostDominatorTree &PDT, DenseSet<const Value *> &DV)
-      : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {}
+                       PostDominatorTree &PDT, DenseSet<const Value *> &DV,
+                       DenseSet<const Use *> &DU)
+      : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV), DU(DU) {}
   void populateWithSourcesOfDivergence();
   void propagate();

@@ -118,11 +119,14 @@
   PostDominatorTree &PDT;
   std::vector<Value *> Worklist; // Stack for DFS.
   DenseSet<const Value *> &DV;   // Stores all divergent values.
+  DenseSet<const Use *> &DU;     // Stores divergent uses of possibly uniform
+                                 // values.
 };

 void DivergencePropagator::populateWithSourcesOfDivergence() {
   Worklist.clear();
   DV.clear();
+  DU.clear();
   for (auto &I : instructions(F)) {
     if (TTI.isSourceOfDivergence(&I)) {
       Worklist.push_back(&I);
@@ -197,8 +201,10 @@
   // dominators of TI until it is outside the influence region.
   BasicBlock *InfluencedBB = ThisBB;
   while (InfluenceRegion.count(InfluencedBB)) {
-    for (auto &I : *InfluencedBB)
-      findUsersOutsideInfluenceRegion(I, InfluenceRegion);
+    for (auto &I : *InfluencedBB) {
+      if (!DV.count(&I))
+        findUsersOutsideInfluenceRegion(I, InfluenceRegion);
+    }
     DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom();
     if (IDomNode == nullptr)
       break;
@@ -208,9 +214,10 @@
 void DivergencePropagator::findUsersOutsideInfluenceRegion(
     Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) {
-  for (User *U : I.users()) {
-    Instruction *UserInst = cast<Instruction>(U);
+  for (Use &Use : I.uses()) {
+    Instruction *UserInst = cast<Instruction>(Use.getUser());
     if (!InfluenceRegion.count(UserInst->getParent())) {
+      DU.insert(&Use);
       if (DV.insert(UserInst).second)
         Worklist.push_back(UserInst);
     }
   }
@@ -320,6 +327,7 @@
     return false;

   DivergentValues.clear();
+  DivergentUses.clear();
   gpuDA = nullptr;

   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -332,7 +340,7 @@
   } else {
     // run LLVM's existing DivergenceAnalysis
-    DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues);
+    DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues, DivergentUses);
     DP.populateWithSourcesOfDivergence();
     DP.propagate();
   }
@@ -351,6 +359,13 @@
   return DivergentValues.count(V);
 }

+bool LegacyDivergenceAnalysis::isDivergentUse(const Use *U) const {
+  if (gpuDA) {
+    return gpuDA->isDivergentUse(*U);
+  }
+  return DivergentValues.count(U->get()) || DivergentUses.count(U);
+}
+
 void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
   if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty())
     return;
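The client change in AMDGPUAtomicOptimizer below follows a mechanical pattern: ask for the operand's Use instead of its Value. As a standalone illustration (a sketch only; `operandIsDivergent` is an invented name, not part of this patch):

// Sketch of the caller-side rewrite applied below.
static bool operandIsDivergent(const llvm::Instruction &I, unsigned Idx,
                               const llvm::LegacyDivergenceAnalysis &DA) {
  // Before: definition-based query - can miss temporal divergence at the use.
  //   return DA.isDivergent(I.getOperand(Idx));
  // After: use-based query - also accounts for where the operand is consumed.
  return DA.isDivergentUse(&I.getOperandUse(Idx));
}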
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -142,11 +142,11 @@

   // If the pointer operand is divergent, then each lane is doing an atomic
   // operation on a different address, and we cannot optimize that.
-  if (DA->isDivergent(I.getOperand(PtrIdx))) {
+  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
     return;
   }

-  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
@@ -219,7 +219,7 @@

   const unsigned ValIdx = 0;

-  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
@@ -232,7 +232,7 @@
   // If any of the other arguments to the intrinsic are divergent, we can't
   // optimize the operation.
   for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
-    if (DA->isDivergent(I.getOperand(Idx))) {
+    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
       return;
     }
   }
Index: llvm/trunk/test/CodeGen/AMDGPU/divergence-at-use.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/divergence-at-use.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/divergence-at-use.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true < %s -use-gpu-divergence-analysis | FileCheck %s
+
+@local = addrspace(3) global i32 undef
+
+define void @reducible(i32 %x) {
+; CHECK-LABEL: reducible:
+; CHECK-NOT: dpp
+entry:
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+  %gep = getelementptr i32, i32 addrspace(3)* @local, i32 %i
+  %cond = icmp ult i32 %i, %x
+  %i1 = add i32 %i, 1
+  br i1 %cond, label %loop, label %exit
+exit:
+  %old = atomicrmw add i32 addrspace(3)* %gep, i32 %x acq_rel
+  ret void
+}