Diff 23566

llvm/trunk/include/llvm/Analysis/Passes.h

Show First 20 Lines • Show All 132 Lines • ▼ Show 20 Lines	namespace llvm {
//		//
// createDelinearizationPass - This pass implements attempts to restore		// createDelinearizationPass - This pass implements attempts to restore
// multidimensional array indices from linearized expressions.		// multidimensional array indices from linearized expressions.
//		//
FunctionPass *createDelinearizationPass();		FunctionPass *createDelinearizationPass();

//===--------------------------------------------------------------------===//		//===--------------------------------------------------------------------===//
//		//
		// createDivergenceAnalysisPass - This pass determines which branches in a GPU
		// program are divergent.
		//
		FunctionPass *createDivergenceAnalysisPass();

		//===--------------------------------------------------------------------===//
		//
// Minor pass prototypes, allowing us to expose them through bugpoint and		// Minor pass prototypes, allowing us to expose them through bugpoint and
// analyze.		// analyze.
FunctionPass *createInstCountPass();		FunctionPass *createInstCountPass();

//===--------------------------------------------------------------------===//		//===--------------------------------------------------------------------===//
//		//
// createRegionInfoPass - This pass finds all single entry single exit regions		// createRegionInfoPass - This pass finds all single entry single exit regions
// in a function and builds the region hierarchy.		// in a function and builds the region hierarchy.
Show All 23 Lines

llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 184 Lines • ▼ Show 20 Lines	public:
/// folded into a load or other instruction, but if they are used in some		/// folded into a load or other instruction, but if they are used in some
/// other context they may not be folded. This routine can distinguish such		/// other context they may not be folded. This routine can distinguish such
/// cases.		/// cases.
///		///
/// The returned cost is defined in terms of \c TargetCostConstants, see its		/// The returned cost is defined in terms of \c TargetCostConstants, see its
/// comments for a detailed explanation of the cost values.		/// comments for a detailed explanation of the cost values.
unsigned getUserCost(const User *U) const;		unsigned getUserCost(const User *U) const;

/// \brief hasBranchDivergence - Return true if branch divergence exists.		/// \brief Return true if branch divergence exists.
		///
/// Branch divergence has a significantly negative impact on GPU performance		/// Branch divergence has a significantly negative impact on GPU performance
/// when threads in the same wavefront take different paths due to conditional		/// when threads in the same wavefront take different paths due to conditional
/// branches.		/// branches.
bool hasBranchDivergence() const;		bool hasBranchDivergence() const;

		/// \brief Returns whether V is a source of divergence.
		///
		/// This function provides the target-dependent information for
		/// the target-independent DivergenceAnalysis. DivergenceAnalysis first
		/// builds the dependency graph, and then runs the reachability algorithm
		/// starting with the sources of divergence.
		bool isSourceOfDivergence(const Value *V) const;

/// \brief Test whether calls to a function lower to actual program function		/// \brief Test whether calls to a function lower to actual program function
/// calls.		/// calls.
///		///
/// The idea is to test whether the program is likely to require a 'call'		/// The idea is to test whether the program is likely to require a 'call'
/// instruction or equivalent in order to call the given function.		/// instruction or equivalent in order to call the given function.
///		///
/// FIXME: It's not clear that this is a good or useful query API. Client's		/// FIXME: It's not clear that this is a good or useful query API. Client's
/// should probably move to simpler cost metrics using the above.		/// should probably move to simpler cost metrics using the above.
▲ Show 20 Lines • Show All 308 Lines • ▼ Show 20 Lines	public:
virtual unsigned getCallCost(const Function *F,		virtual unsigned getCallCost(const Function *F,
ArrayRef<const Value *> Arguments) = 0;		ArrayRef<const Value *> Arguments) = 0;
virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys) = 0;		ArrayRef<Type *> ParamTys) = 0;
virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments) = 0;		ArrayRef<const Value *> Arguments) = 0;
virtual unsigned getUserCost(const User *U) = 0;		virtual unsigned getUserCost(const User *U) = 0;
virtual bool hasBranchDivergence() = 0;		virtual bool hasBranchDivergence() = 0;
		virtual bool isSourceOfDivergence(const Value *V) = 0;
virtual bool isLoweredToCall(const Function *F) = 0;		virtual bool isLoweredToCall(const Function *F) = 0;
virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;		virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;
virtual bool isLegalAddImmediate(int64_t Imm) = 0;		virtual bool isLegalAddImmediate(int64_t Imm) = 0;
virtual bool isLegalICmpImmediate(int64_t Imm) = 0;		virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
virtual bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV,		virtual bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV,
int64_t BaseOffset, bool HasBaseReg,		int64_t BaseOffset, bool HasBaseReg,
int64_t Scale) = 0;		int64_t Scale) = 0;
virtual bool isLegalMaskedStore(Type *DataType, int Consecutive) = 0;		virtual bool isLegalMaskedStore(Type *DataType, int Consecutive) = 0;
▲ Show 20 Lines • Show All 83 Lines • ▼ Show 20 Lines	unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
return Impl.getIntrinsicCost(IID, RetTy, ParamTys);		return Impl.getIntrinsicCost(IID, RetTy, ParamTys);
}		}
unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments) override {		ArrayRef<const Value *> Arguments) override {
return Impl.getIntrinsicCost(IID, RetTy, Arguments);		return Impl.getIntrinsicCost(IID, RetTy, Arguments);
}		}
unsigned getUserCost(const User *U) override { return Impl.getUserCost(U); }		unsigned getUserCost(const User *U) override { return Impl.getUserCost(U); }
bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }		bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }
		bool isSourceOfDivergence(const Value *V) override {
		return Impl.isSourceOfDivergence(V);
		}
bool isLoweredToCall(const Function *F) override {		bool isLoweredToCall(const Function *F) override {
return Impl.isLoweredToCall(F);		return Impl.isLoweredToCall(F);
}		}
void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) override {		void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) override {
return Impl.getUnrollingPreferences(L, UP);		return Impl.getUnrollingPreferences(L, UP);
}		}
bool isLegalAddImmediate(int64_t Imm) override {		bool isLegalAddImmediate(int64_t Imm) override {
return Impl.isLegalAddImmediate(Imm);		return Impl.isLegalAddImmediate(Imm);
▲ Show 20 Lines • Show All 233 Lines • Show Last 20 Lines

llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 158 Lines • ▼ Show 20 Lines	unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::experimental_gc_relocate:		case Intrinsic::experimental_gc_relocate:
// These intrinsics don't actually represent code after lowering.		// These intrinsics don't actually represent code after lowering.
return TTI::TCC_Free;		return TTI::TCC_Free;
}		}
}		}

bool hasBranchDivergence() { return false; }		bool hasBranchDivergence() { return false; }

		bool isSourceOfDivergence(const Value *V) { return false; }

bool isLoweredToCall(const Function *F) {		bool isLoweredToCall(const Function *F) {
// FIXME: These should almost certainly not be handled here, and instead		// FIXME: These should almost certainly not be handled here, and instead
// handled with the help of TLI or the target itself. This was largely		// handled with the help of TLI or the target itself. This was largely
// ported from existing analysis heuristics here so that such refactorings		// ported from existing analysis heuristics here so that such refactorings
// can take place in the future.		// can take place in the future.

if (F->isIntrinsic())		if (F->isIntrinsic())
return false;		return false;
▲ Show 20 Lines • Show All 265 Lines • Show Last 20 Lines

llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 108 Lines • ▼ Show 20 Lines	BasicTTIImplBase &operator=(BasicTTIImplBase &&RHS) {
return *this;		return *this;
}		}

/// \name Scalar TTI Implementations		/// \name Scalar TTI Implementations
/// @{		/// @{

bool hasBranchDivergence() { return false; }		bool hasBranchDivergence() { return false; }

		bool isSourceOfDivergence(const Value *V) { return false; }

bool isLegalAddImmediate(int64_t imm) {		bool isLegalAddImmediate(int64_t imm) {
return getTLI()->isLegalAddImmediate(imm);		return getTLI()->isLegalAddImmediate(imm);
}		}

bool isLegalICmpImmediate(int64_t imm) {		bool isLegalICmpImmediate(int64_t imm) {
return getTLI()->isLegalICmpImmediate(imm);		return getTLI()->isLegalICmpImmediate(imm);
}		}

▲ Show 20 Lines • Show All 641 Lines • Show Last 20 Lines

llvm/trunk/include/llvm/InitializePasses.h

	Show First 20 Lines • Show All 104 Lines • ▼ Show 20 Lines
	void initializeDAEPass(PassRegistry&);			void initializeDAEPass(PassRegistry&);
	void initializeDAHPass(PassRegistry&);			void initializeDAHPass(PassRegistry&);
	void initializeDCEPass(PassRegistry&);			void initializeDCEPass(PassRegistry&);
	void initializeDSEPass(PassRegistry&);			void initializeDSEPass(PassRegistry&);
	void initializeDeadInstEliminationPass(PassRegistry&);			void initializeDeadInstEliminationPass(PassRegistry&);
	void initializeDeadMachineInstructionElimPass(PassRegistry&);			void initializeDeadMachineInstructionElimPass(PassRegistry&);
	void initializeDelinearizationPass(PassRegistry &);			void initializeDelinearizationPass(PassRegistry &);
	void initializeDependenceAnalysisPass(PassRegistry&);			void initializeDependenceAnalysisPass(PassRegistry&);
				void initializeDivergenceAnalysisPass(PassRegistry&);
	void initializeDomOnlyPrinterPass(PassRegistry&);			void initializeDomOnlyPrinterPass(PassRegistry&);
	void initializeDomOnlyViewerPass(PassRegistry&);			void initializeDomOnlyViewerPass(PassRegistry&);
	void initializeDomPrinterPass(PassRegistry&);			void initializeDomPrinterPass(PassRegistry&);
	void initializeDomViewerPass(PassRegistry&);			void initializeDomViewerPass(PassRegistry&);
	void initializeDominanceFrontierPass(PassRegistry&);			void initializeDominanceFrontierPass(PassRegistry&);
	void initializeDominatorTreeWrapperPassPass(PassRegistry&);			void initializeDominatorTreeWrapperPassPass(PassRegistry&);
	void initializeEarlyIfConverterPass(PassRegistry&);			void initializeEarlyIfConverterPass(PassRegistry&);
	void initializeEdgeBundlesPass(PassRegistry&);			void initializeEdgeBundlesPass(PassRegistry&);
	▲ Show 20 Lines • Show All 180 Lines • Show Last 20 Lines

llvm/trunk/include/llvm/LinkAllPasses.h

Show First 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	ForcePassLinking() {
(void) llvm::createConstantMergePass();		(void) llvm::createConstantMergePass();
(void) llvm::createConstantPropagationPass();		(void) llvm::createConstantPropagationPass();
(void) llvm::createCostModelAnalysisPass();		(void) llvm::createCostModelAnalysisPass();
(void) llvm::createDeadArgEliminationPass();		(void) llvm::createDeadArgEliminationPass();
(void) llvm::createDeadCodeEliminationPass();		(void) llvm::createDeadCodeEliminationPass();
(void) llvm::createDeadInstEliminationPass();		(void) llvm::createDeadInstEliminationPass();
(void) llvm::createDeadStoreEliminationPass();		(void) llvm::createDeadStoreEliminationPass();
(void) llvm::createDependenceAnalysisPass();		(void) llvm::createDependenceAnalysisPass();
		(void) llvm::createDivergenceAnalysisPass();
(void) llvm::createDomOnlyPrinterPass();		(void) llvm::createDomOnlyPrinterPass();
(void) llvm::createDomPrinterPass();		(void) llvm::createDomPrinterPass();
(void) llvm::createDomOnlyViewerPass();		(void) llvm::createDomOnlyViewerPass();
(void) llvm::createDomViewerPass();		(void) llvm::createDomViewerPass();
(void) llvm::createGCOVProfilerPass();		(void) llvm::createGCOVProfilerPass();
(void) llvm::createInstrProfilingPass();		(void) llvm::createInstrProfilingPass();
(void) llvm::createFunctionInliningPass();		(void) llvm::createFunctionInliningPass();
(void) llvm::createAlwaysInlinerPass();		(void) llvm::createAlwaysInlinerPass();
▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines

llvm/trunk/lib/Analysis/Analysis.cpp

Show All 31 Lines	void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeCostModelAnalysisPass(Registry);		initializeCostModelAnalysisPass(Registry);
initializeCFGViewerPass(Registry);		initializeCFGViewerPass(Registry);
initializeCFGPrinterPass(Registry);		initializeCFGPrinterPass(Registry);
initializeCFGOnlyViewerPass(Registry);		initializeCFGOnlyViewerPass(Registry);
initializeCFGOnlyPrinterPass(Registry);		initializeCFGOnlyPrinterPass(Registry);
initializeCFLAliasAnalysisPass(Registry);		initializeCFLAliasAnalysisPass(Registry);
initializeDependenceAnalysisPass(Registry);		initializeDependenceAnalysisPass(Registry);
initializeDelinearizationPass(Registry);		initializeDelinearizationPass(Registry);
		initializeDivergenceAnalysisPass(Registry);
initializeDominanceFrontierPass(Registry);		initializeDominanceFrontierPass(Registry);
initializeDomViewerPass(Registry);		initializeDomViewerPass(Registry);
initializeDomPrinterPass(Registry);		initializeDomPrinterPass(Registry);
initializeDomOnlyViewerPass(Registry);		initializeDomOnlyViewerPass(Registry);
initializePostDomViewerPass(Registry);		initializePostDomViewerPass(Registry);
initializeDomOnlyPrinterPass(Registry);		initializeDomOnlyPrinterPass(Registry);
initializePostDomPrinterPass(Registry);		initializePostDomPrinterPass(Registry);
initializePostDomOnlyViewerPass(Registry);		initializePostDomOnlyViewerPass(Registry);
▲ Show 20 Lines • Show All 70 Lines • Show Last 20 Lines

llvm/trunk/lib/Analysis/CMakeLists.txt

Show All 14 Lines	add_llvm_library(LLVMAnalysis
CFLAliasAnalysis.cpp		CFLAliasAnalysis.cpp
CGSCCPassManager.cpp		CGSCCPassManager.cpp
CaptureTracking.cpp		CaptureTracking.cpp
CostModel.cpp		CostModel.cpp
CodeMetrics.cpp		CodeMetrics.cpp
ConstantFolding.cpp		ConstantFolding.cpp
Delinearization.cpp		Delinearization.cpp
DependenceAnalysis.cpp		DependenceAnalysis.cpp
		DivergenceAnalysis.cpp
DomPrinter.cpp		DomPrinter.cpp
DominanceFrontier.cpp		DominanceFrontier.cpp
IVUsers.cpp		IVUsers.cpp
InstCount.cpp		InstCount.cpp
InstructionSimplify.cpp		InstructionSimplify.cpp
Interval.cpp		Interval.cpp
IntervalPartition.cpp		IntervalPartition.cpp
LazyCallGraph.cpp		LazyCallGraph.cpp
Show All 39 Lines

llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp

				//===- DivergenceAnalysis.cpp ------ Divergence Analysis ------------------===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				//
				// This file defines divergence analysis which determines whether a branch in a
				// GPU program is divergent. It can help branch optimizations such as jump
				// threading and loop unswitching to make better decisions.
				//
				// GPU programs typically use the SIMD execution model, where multiple threads
				// in the same execution group have to execute in lock-step. Therefore, if the
				// code contains divergent branches (i.e., threads in a group do not agree on
				// which path of the branch to take), the group of threads has to execute all
				// the paths from that branch with different subsets of threads enabled until
				// they converge at the immediately post-dominating BB of the paths.
				//
				// Due to this execution model, some optimizations such as jump
				// threading and loop unswitching can be unfortunately harmful when performed on
				// divergent branches. Therefore, an analysis that computes which branches in a
				// GPU program are divergent can help the compiler to selectively run these
				// optimizations.
				//
				// This file defines divergence analysis which computes a conservative but
				// non-trivial approximation of all divergent branches in a GPU program. It
				// partially implements the approach described in
				//
				// Divergence Analysis
				// Sampaio, Souza, Collange, Pereira
				// TOPLAS '13
				//
				// The divergence analysis identifies the sources of divergence (e.g., special
				// variables that hold the thread ID), and recursively marks variables that are
				// data or sync dependent on a source of divergence as divergent.
				//
				// While data dependency is a well-known concept, the notion of sync dependency
				// is worth more explanation. Sync dependence characterizes the control flow
				// aspect of the propagation of branch divergence. For example,
				//
				// %cond = icmp slt i32 %tid, 10
				// br i1 %cond, label %then, label %else
				// then:
				// br label %merge
				// else:
				// br label %merge
				// merge:
				// %a = phi i32 [ 0, %then ], [ 1, %else ]
				//
				// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
				// because %tid is not on its use-def chains, %a is sync dependent on %tid
				// because the branch "br i1 %cond" depends on %tid and affects which value %a
				// is assigned to.
				//
				// The current implementation has the following limitations:
				// 1. intra-procedural. It conservatively considers the arguments of a
				// non-kernel-entry function and the return value of a function call as
				// divergent.
				// 2. memory as black box. It conservatively considers values loaded from
				// generic or local address as divergent. This can be improved by leveraging
				// pointer analysis.
				//===----------------------------------------------------------------------===//

				#include <vector>
				#include "llvm/IR/Dominators.h"
				#include "llvm/ADT/DenseSet.h"
				#include "llvm/Analysis/Passes.h"
				#include "llvm/Analysis/PostDominators.h"
				#include "llvm/Analysis/TargetTransformInfo.h"
				#include "llvm/IR/Function.h"
				#include "llvm/IR/InstIterator.h"
				#include "llvm/IR/Instructions.h"
				#include "llvm/IR/IntrinsicInst.h"
				#include "llvm/IR/Value.h"
				#include "llvm/Pass.h"
				#include "llvm/Support/CommandLine.h"
				#include "llvm/Support/Debug.h"
				#include "llvm/Support/raw_ostream.h"
				#include "llvm/Transforms/Scalar.h"
				using namespace llvm;

				#define DEBUG_TYPE "divergence"

				namespace {
				class DivergenceAnalysis : public FunctionPass {
				public:
				static char ID;

				DivergenceAnalysis() : FunctionPass(ID) {
				initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry());
				}

				void getAnalysisUsage(AnalysisUsage &AU) const override {
				AU.addRequired<DominatorTreeWrapperPass>();
				AU.addRequired<PostDominatorTree>();
				AU.setPreservesAll();
				}

				bool runOnFunction(Function &F) override;

				// Print all divergent branches in the function.
				void print(raw_ostream &OS, const Module *) const override;

				// Returns true if V is divergent.
				bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
				// Returns true if V is uniform/non-divergent.
				bool isUniform(const Value *V) const { return !isDivergent(V); }

				private:
				// Stores all divergent values.
				DenseSet<const Value *> DivergentValues;
				};
				} // End of anonymous namespace

				// Register this pass.
				char DivergenceAnalysis::ID = 0;
				INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis",
				false, true)
				INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
				INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
				INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis",
				false, true)

				namespace {

				class DivergencePropagator {
				public:
				DivergencePropagator(Function &F, TargetTransformInfo &TTI,
				DominatorTree &DT, PostDominatorTree &PDT,
				DenseSet<const Value *> &DV)
				: F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {}
				void populateWithSourcesOfDivergence();
				void propagate();

				private:
				// A helper function that explores data dependents of V.
				void exploreDataDependency(Value *V);
				// A helper function that explores sync dependents of TI.
				void exploreSyncDependency(TerminatorInst *TI);
				// Computes the influence region from Start to End. This region includes all
				// basic blocks on any path from Start to End.
				void computeInfluenceRegion(BasicBlock Start, BasicBlock End,
				DenseSet<BasicBlock *> &InfluenceRegion);
				// Finds all users of I that are outside the influence region, and add these
				// users to Worklist.
				void findUsersOutsideInfluenceRegion(
				Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion);

				Function &F;
				TargetTransformInfo &TTI;
				DominatorTree &DT;
				PostDominatorTree &PDT;
				std::vector<Value *> Worklist; // Stack for DFS.
				DenseSet<const Value *> &DV; // Stores all divergent values.
				};

				void DivergencePropagator::populateWithSourcesOfDivergence() {
				Worklist.clear();
				DV.clear();
				for (auto &I : inst_range(F)) {
				if (TTI.isSourceOfDivergence(&I)) {
				Worklist.push_back(&I);
				DV.insert(&I);
				}
				}
				for (auto &Arg : F.args()) {
				if (TTI.isSourceOfDivergence(&Arg)) {
				Worklist.push_back(&Arg);
				DV.insert(&Arg);
				}
				}
				}

				void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) {
				// Propagation rule 1: if branch TI is divergent, all PHINodes in TI's
				// immediate post dominator are divergent. This rule handles if-then-else
				// patterns. For example,
				//
				// if (tid < 5)
				// a1 = 1;
				// else
				// a2 = 2;
				// a = phi(a1, a2); // sync dependent on (tid < 5)
				BasicBlock *ThisBB = TI->getParent();
				BasicBlock *IPostDom = PDT.getNode(ThisBB)->getIDom()->getBlock();
				if (IPostDom == nullptr)
				return;

				for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) {
				// A PHINode is uniform if it returns the same value no matter which path is
				// taken.
				if (!cast<PHINode>(I)->hasConstantValue() && DV.insert(I).second)
				Worklist.push_back(I);
				}

				// Propagation rule 2: if a value defined in a loop is used outside, the user
				// is sync dependent on the condition of the loop exits that dominate the
				// user. For example,
				//
				// int i = 0;
				// do {
				// i++;
				// if (foo(i)) ... // uniform
				// } while (i < tid);
				// if (bar(i)) ... // divergent
				//
				// A program may contain unstructured loops. Therefore, we cannot leverage
				// LoopInfo, which only recognizes natural loops.
				//
				// The algorithm used here handles both natural and unstructured loops. Given
				// a branch TI, we first compute its influence region, the union of all simple
				// paths from TI to its immediate post dominator (IPostDom). Then, we search
				// for all the values defined in the influence region but used outside. All
				// these users are sync dependent on TI.
				DenseSet<BasicBlock *> InfluenceRegion;
				computeInfluenceRegion(ThisBB, IPostDom, InfluenceRegion);
				// An insight that can speed up the search process is that all the in-region
				// values that are used outside must dominate TI. Therefore, instead of
				// searching every basic blocks in the influence region, we search all the
				// dominators of TI until it is outside the influence region.
				BasicBlock *InfluencedBB = ThisBB;
				while (InfluenceRegion.count(InfluencedBB)) {
				for (auto &I : *InfluencedBB)
				findUsersOutsideInfluenceRegion(I, InfluenceRegion);
				DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom();
				if (IDomNode == nullptr)
				break;
				InfluencedBB = IDomNode->getBlock();
				}
				}

				void DivergencePropagator::findUsersOutsideInfluenceRegion(
				Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) {
				for (User *U : I.users()) {
				Instruction *UserInst = cast<Instruction>(U);
				if (!InfluenceRegion.count(UserInst->getParent())) {
				if (DV.insert(UserInst).second)
				Worklist.push_back(UserInst);
				}
				}
				}

				void DivergencePropagator::computeInfluenceRegion(
				BasicBlock Start, BasicBlock End,
				DenseSet<BasicBlock *> &InfluenceRegion) {
				assert(PDT.properlyDominates(End, Start) &&
				"End does not properly dominate Start");
				std::vector<BasicBlock *> InfluenceStack;
				InfluenceStack.push_back(Start);
				InfluenceRegion.insert(Start);
				while (!InfluenceStack.empty()) {
				BasicBlock *BB = InfluenceStack.back();
				InfluenceStack.pop_back();
				for (BasicBlock *Succ : successors(BB)) {
				if (End != Succ && InfluenceRegion.insert(Succ).second)
				InfluenceStack.push_back(Succ);
				}
				}
				}

				void DivergencePropagator::exploreDataDependency(Value *V) {
				// Follow def-use chains of V.
				for (User *U : V->users()) {
				Instruction *UserInst = cast<Instruction>(U);
				if (DV.insert(UserInst).second)
				Worklist.push_back(UserInst);
				}
				}

				void DivergencePropagator::propagate() {
				// Traverse the dependency graph using DFS.
				while (!Worklist.empty()) {
				Value *V = Worklist.back();
				Worklist.pop_back();
				if (TerminatorInst *TI = dyn_cast<TerminatorInst>(V)) {
				// Terminators with less than two successors won't introduce sync
				// dependency. Ignore them.
				if (TI->getNumSuccessors() > 1)
				exploreSyncDependency(TI);
				}
				exploreDataDependency(V);
				}
				}

				} /// end namespace anonymous

				FunctionPass *llvm::createDivergenceAnalysisPass() {
				return new DivergenceAnalysis();
				}

				bool DivergenceAnalysis::runOnFunction(Function &F) {
				auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
				if (TTIWP == nullptr)
				return false;

				TargetTransformInfo &TTI = TTIWP->getTTI(F);
				// Fast path: if the target does not have branch divergence, we do not mark
				// any branch as divergent.
				if (!TTI.hasBranchDivergence())
				return false;

				DivergentValues.clear();
				DivergencePropagator DP(F, TTI,
				getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
				getAnalysis<PostDominatorTree>(), DivergentValues);
				DP.populateWithSourcesOfDivergence();
				DP.propagate();
				return false;
				}

				void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
				if (DivergentValues.empty())
				return;
				const Value FirstDivergentValue = DivergentValues.begin();
				const Function *F;
				if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
				F = Arg->getParent();
				} else if (const Instruction *I =
				dyn_cast<Instruction>(FirstDivergentValue)) {
				F = I->getParent()->getParent();
				} else {
				llvm_unreachable("Only arguments and instructions can be divergent");
				}

				// Dumps all divergent values in F, arguments and then instructions.
				for (auto &Arg : F->args()) {
				if (DivergentValues.count(&Arg))
				OS << "DIVERGENT: " << Arg << "\n";
				}
				// Iterate instructions using inst_range to ensure a deterministic order.
				for (auto &I : inst_range(F)) {
				if (DivergentValues.count(&I))
				OS << "DIVERGENT:" << I << "\n";
				}
				}

llvm/trunk/lib/Analysis/TargetTransformInfo.cpp

	Show First 20 Lines • Show All 70 Lines • ▼ Show 20 Lines
	unsigned TargetTransformInfo::getUserCost(const User *U) const {			unsigned TargetTransformInfo::getUserCost(const User *U) const {
	return TTIImpl->getUserCost(U);			return TTIImpl->getUserCost(U);
	}			}

	bool TargetTransformInfo::hasBranchDivergence() const {			bool TargetTransformInfo::hasBranchDivergence() const {
	return TTIImpl->hasBranchDivergence();			return TTIImpl->hasBranchDivergence();
	}			}

				bool TargetTransformInfo::isSourceOfDivergence(const Value *V) const {
				return TTIImpl->isSourceOfDivergence(V);
				}

	bool TargetTransformInfo::isLoweredToCall(const Function *F) const {			bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
	return TTIImpl->isLoweredToCall(F);			return TTIImpl->isLoweredToCall(F);
	}			}

	void TargetTransformInfo::getUnrollingPreferences(			void TargetTransformInfo::getUnrollingPreferences(
	Loop *L, UnrollingPreferences &UP) const {			Loop *L, UnrollingPreferences &UP) const {
	return TTIImpl->getUnrollingPreferences(L, UP);			return TTIImpl->getUnrollingPreferences(L, UP);
	}			}
	▲ Show 20 Lines • Show All 234 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	NVPTXTTIImpl &operator=(NVPTXTTIImpl &&RHS) {
BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));		BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
ST = std::move(RHS.ST);		ST = std::move(RHS.ST);
TLI = std::move(RHS.TLI);		TLI = std::move(RHS.TLI);
return *this;		return *this;
}		}

bool hasBranchDivergence() { return true; }		bool hasBranchDivergence() { return true; }

		bool isSourceOfDivergence(const Value *V);

unsigned getArithmeticInstrCost(		unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty,		unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,		TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,		TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,		TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);		TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
};		};

} // end namespace llvm		} // end namespace llvm

#endif		#endif

llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

	//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//			//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
	//			//
	// The LLVM Compiler Infrastructure			// The LLVM Compiler Infrastructure
	//			//
	// This file is distributed under the University of Illinois Open Source			// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.			// License. See LICENSE.TXT for details.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#include "NVPTXTargetTransformInfo.h"			#include "NVPTXTargetTransformInfo.h"
				#include "NVPTXUtilities.h"
	#include "llvm/Analysis/LoopInfo.h"			#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"			#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Analysis/ValueTracking.h"			#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/BasicTTIImpl.h"			#include "llvm/CodeGen/BasicTTIImpl.h"
	#include "llvm/Support/Debug.h"			#include "llvm/Support/Debug.h"
	#include "llvm/Target/CostTable.h"			#include "llvm/Target/CostTable.h"
	#include "llvm/Target/TargetLowering.h"			#include "llvm/Target/TargetLowering.h"
	using namespace llvm;			using namespace llvm;

	#define DEBUG_TYPE "NVPTXtti"			#define DEBUG_TYPE "NVPTXtti"

				// Whether the given intrinsic reads threadIdx.x/y/z.
				static bool readsThreadIndex(const IntrinsicInst *II) {
				switch (II->getIntrinsicID()) {
				default: return false;
				case Intrinsic::nvvm_read_ptx_sreg_tid_x:
				case Intrinsic::nvvm_read_ptx_sreg_tid_y:
				case Intrinsic::nvvm_read_ptx_sreg_tid_z:
				return true;
				}
				}

				static bool readsLaneId(const IntrinsicInst *II) {
				return II->getIntrinsicID() == Intrinsic::ptx_read_laneid;
				}

				// Whether the given intrinsic is an atomic instruction in PTX.
				static bool isNVVMAtomic(const IntrinsicInst *II) {
				switch (II->getIntrinsicID()) {
				default: return false;
				case Intrinsic::nvvm_atomic_load_add_f32:
				case Intrinsic::nvvm_atomic_load_inc_32:
				case Intrinsic::nvvm_atomic_load_dec_32:
				return true;
				}
				}

				bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
				// Without inter-procedural analysis, we conservatively assume that arguments
				// to __device__ functions are divergent.
				if (const Argument *Arg = dyn_cast<Argument>(V))
				return !isKernelFunction(*Arg->getParent());

				if (const Instruction *I = dyn_cast<Instruction>(V)) {
				// Without pointer analysis, we conservatively assume values loaded from
				// generic or local address space are divergent.
				if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
				unsigned AS = LI->getPointerAddressSpace();
				return AS == ADDRESS_SPACE_GENERIC \|\| AS == ADDRESS_SPACE_LOCAL;
				}
				// Atomic instructions may cause divergence. Atomic instructions are
				// executed sequentially across all threads in a warp. Therefore, an earlier
				// executed thread may see different memory inputs than a later executed
				// thread. For example, suppose *a = 0 initially.
				//
				// atom.global.add.s32 d, [a], 1
				//
				// returns 0 for the first thread that enters the critical region, and 1 for
				// the second thread.
				if (I->isAtomic())
				return true;
				if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
				// Instructions that read threadIdx are obviously divergent.
				if (readsThreadIndex(II) \|\| readsLaneId(II))
				return true;
				// Handle the NVPTX atomic instrinsics that cannot be represented as an
				// atomic IR instruction.
				if (isNVVMAtomic(II))
				return true;
				}
				// Conservatively consider the return value of function calls as divergent.
				// We could analyze callees with bodies more precisely using
				// inter-procedural analysis.
				if (isa<CallInst>(I))
				return true;
				}

				return false;
				}

	unsigned NVPTXTTIImpl::getArithmeticInstrCost(			unsigned NVPTXTTIImpl::getArithmeticInstrCost(
	unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,			unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
	TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,			TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
	TTI::OperandValueProperties Opd2PropInfo) {			TTI::OperandValueProperties Opd2PropInfo) {
	// Legalize the type.			// Legalize the type.
	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);			std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

	int ISD = TLI->InstructionOpcodeToISD(Opcode);			int ISD = TLI->InstructionOpcodeToISD(Opcode);
	Show All 20 Lines

llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll

				; RUN: opt %s -analyze -divergence \| FileCheck %s

				target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
				target triple = "nvptx64-nvidia-cuda"

				; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
				define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
				; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge'
				entry:
				%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
				%cond = icmp slt i32 %n, 0
				br i1 %cond, label %then, label %else ; uniform
				; CHECK-NOT: DIVERGENT: br i1 %cond,
				then:
				%a1 = add i32 %a, %tid
				br label %merge
				else:
				%b2 = add i32 %b, %tid
				br label %merge
				merge:
				%c = phi i32 [ %a1, %then ], [ %b2, %else ]
				ret i32 %c
				}

				; c = a;
				; if (threadIdx.x < 5) // divergent: data dependent
				; c = b;
				; return c; // c is divergent: sync dependent
				define i32 @sync(i32 %a, i32 %b) {
				; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync'
				bb1:
				%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
				%cond = icmp slt i32 %tid, 5
				br i1 %cond, label %bb2, label %bb3
				; CHECK: DIVERGENT: br i1 %cond,
				bb2:
				br label %bb3
				bb3:
				%c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid
				; CHECK: DIVERGENT: %c =
				ret i32 %c
				}

				; c = 0;
				; if (threadIdx.x >= 5) { // divergent
				; c = (n < 0 ? a : b); // c here is uniform because n is uniform
				; }
				; // c here is divergent because it is sync dependent on threadIdx.x >= 5
				; return c;
				define i32 @mixed(i32 %n, i32 %a, i32 %b) {
				; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed'
				bb1:
				%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
				%cond = icmp slt i32 %tid, 5
				br i1 %cond, label %bb6, label %bb2
				; CHECK: DIVERGENT: br i1 %cond,
				bb2:
				%cond2 = icmp slt i32 %n, 0
				br i1 %cond2, label %bb4, label %bb3
				bb3:
				br label %bb5
				bb4:
				br label %bb5
				bb5:
				%c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ]
				; CHECK-NOT: DIVERGENT: %c =
				br label %bb6
				bb6:
				%c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ]
				; CHECK: DIVERGENT: %c2 =
				ret i32 %c2
				}

				; We conservatively treats all parameters of a __device__ function as divergent.
				define i32 @device(i32 %n, i32 %a, i32 %b) {
				; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device'
				; CHECK: DIVERGENT: i32 %n
				; CHECK: DIVERGENT: i32 %a
				; CHECK: DIVERGENT: i32 %b
				entry:
				%cond = icmp slt i32 %n, 0
				br i1 %cond, label %then, label %else
				; CHECK: DIVERGENT: br i1 %cond,
				then:
				br label %merge
				else:
				br label %merge
				merge:
				%c = phi i32 [ %a, %then ], [ %b, %else ]
				ret i32 %c
				}

				; int i = 0;
				; do {
				; i++; // i here is uniform
				; } while (i < laneid);
				; return i == 10 ? 0 : 1; // i here is divergent
				;
				; The i defined in the loop is used outside.
				define i32 @loop() {
				; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop'
				entry:
				%laneid = call i32 @llvm.ptx.read.laneid()
				br label %loop
				loop:
				%i = phi i32 [ 0, %entry ], [ %i1, %loop ]
				; CHECK-NOT: DIVERGENT: %i =
				%i1 = add i32 %i, 1
				%exit_cond = icmp sge i32 %i1, %laneid
				br i1 %exit_cond, label %loop_exit, label %loop
				loop_exit:
				%cond = icmp eq i32 %i, 10
				br i1 %cond, label %then, label %else
				; CHECK: DIVERGENT: br i1 %cond,
				then:
				ret i32 0
				else:
				ret i32 1
				}

				; Same as @loop, but the loop is in the LCSSA form.
				define i32 @lcssa() {
				; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa'
				entry:
				%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
				br label %loop
				loop:
				%i = phi i32 [ 0, %entry ], [ %i1, %loop ]
				; CHECK-NOT: DIVERGENT: %i =
				%i1 = add i32 %i, 1
				%exit_cond = icmp sge i32 %i1, %tid
				br i1 %exit_cond, label %loop_exit, label %loop
				loop_exit:
				%i.lcssa = phi i32 [ %i, %loop ]
				; CHECK: DIVERGENT: %i.lcssa =
				%cond = icmp eq i32 %i.lcssa, 10
				br i1 %cond, label %then, label %else
				; CHECK: DIVERGENT: br i1 %cond,
				then:
				ret i32 0
				else:
				ret i32 1
				}

				; This test contains an unstructured loop.
				; +-------------- entry ----------------+
				; \| \|
				; V V
				; i1 = phi(0, i3) i2 = phi(0, i3)
				; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
				; ^ \| ^
				; \| V \|
				; +-------- switch (tid / i3) ----------+
				; \|
				; V
				; if (i3 == 5) // divergent
				; because sync dependent on (tid / i3).
				define i32 @unstructured_loop(i1 %entry_cond) {
				; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop'
				entry:
				%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
				br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
				loop_entry_1:
				%i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
				%j1 = add i32 %i1, 1
				br label %loop_body
				loop_entry_2:
				%i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
				%j2 = add i32 %i2, 2
				br label %loop_body
				loop_body:
				%i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
				br label %loop_latch
				loop_latch:
				%div = sdiv i32 %tid, %i3
				switch i32 %div, label %branch [ i32 1, label %loop_entry_1
				i32 2, label %loop_entry_2 ]
				branch:
				%cmp = icmp eq i32 %i3, 5
				br i1 %cmp, label %then, label %else
				; CHECK: DIVERGENT: br i1 %cmp,
				then:
				ret i32 0
				else:
				ret i32 1
				}

				declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
				declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
				declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
				declare i32 @llvm.ptx.read.laneid()

				!nvvm.annotations = !{!0, !1, !2, !3, !4}
				!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1}
				!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1}
				!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1}
				!3 = !{i32 ()* @loop, !"kernel", i32 1}
				!4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1}

llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg

				if not 'NVPTX' in config.root.targets:
				config.unsupported = True

This is an archive of the discontinued LLVM Phabricator instance.

Divergence analysis for GPU programs
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 23566

llvm/trunk/include/llvm/Analysis/Passes.h

llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h

llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h

llvm/trunk/include/llvm/InitializePasses.h

llvm/trunk/include/llvm/LinkAllPasses.h

llvm/trunk/lib/Analysis/Analysis.cpp

llvm/trunk/lib/Analysis/CMakeLists.txt

llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp

llvm/trunk/lib/Analysis/TargetTransformInfo.cpp

llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll

llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg

This is an archive of the discontinued LLVM Phabricator instance.

Divergence analysis for GPU programsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 23566

llvm/trunk/include/llvm/Analysis/Passes.h

llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h

llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h

llvm/trunk/include/llvm/InitializePasses.h

llvm/trunk/include/llvm/LinkAllPasses.h

llvm/trunk/lib/Analysis/Analysis.cpp

llvm/trunk/lib/Analysis/CMakeLists.txt

llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp

llvm/trunk/lib/Analysis/TargetTransformInfo.cpp

llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll

llvm/trunk/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg

Divergence analysis for GPU programs
ClosedPublic