Index: polly/trunk/include/polly/CodeGen/PerfMonitor.h =================================================================== --- polly/trunk/include/polly/CodeGen/PerfMonitor.h +++ polly/trunk/include/polly/CodeGen/PerfMonitor.h @@ -25,8 +25,10 @@ public: /// Create a new performance monitor. /// + /// @param S The scop for which to generate fine-grained performance + /// monitoring information. /// @param M The module for which to generate the performance monitor. - PerfMonitor(llvm::Module *M); + PerfMonitor(const Scop &S, llvm::Module *M); /// Initialize the performance monitor. /// @@ -48,12 +50,18 @@ llvm::Module *M; PollyIRBuilder Builder; + /// The scop to profile against. + const Scop &S; + /// Indicates if performance profiling is supported on this architecture. bool Supported; /// The cycle counter at the beginning of the program execution. llvm::Value *CyclesTotalStartPtr; + /// The total number of cycles spent in the current scop S. + llvm::Value *CyclesInCurrentScopPtr; + /// The total number of cycles spent within scops. llvm::Value *CyclesInScopsPtr; @@ -89,6 +97,12 @@ /// into the module (or obtain references to them if they already exist). void addGlobalVariables(); + /// Add per-scop tracking to module. + /// + /// Insert the global variable which is used to track the number of cycles + /// this scop runs. + void addScopCounter(); + /// Get a reference to the intrinsic "i64 @llvm.x86.rdtscp(i8*)". /// /// The rdtscp function returns the current value of the processor's @@ -126,6 +140,12 @@ /// This function finalizes the performance measurements and prints the /// results to stdout. It is expected to be registered with 'atexit()'. llvm::Function *insertFinalReporting(); + + /// Append Scop reporting data to "__polly_perf_final". + /// + /// This function appends the current scop (S)'s information to the final + /// printing function. 
+ void AppendScopReporting(); }; } // namespace polly Index: polly/trunk/include/polly/ScopInfo.h =================================================================== --- polly/trunk/include/polly/ScopInfo.h +++ polly/trunk/include/polly/ScopInfo.h @@ -2329,6 +2329,14 @@ /// Check if the SCoP has been optimized by the scheduler. bool isOptimized() const { return IsOptimized; } + /// Get the name of the entry and exit blocks of this Scop. + /// + /// These along with the function name can uniquely identify a Scop. + /// + /// @return std::pair whose first element is the entry name & second element + /// is the exit name. + std::pair<std::string, std::string> getEntryExitStr() const; + /// Get the name of this Scop. std::string getNameStr() const; Index: polly/trunk/lib/Analysis/ScopInfo.cpp =================================================================== --- polly/trunk/lib/Analysis/ScopInfo.cpp +++ polly/trunk/lib/Analysis/ScopInfo.cpp @@ -4126,6 +4126,12 @@ std::string Scop::getNameStr() const { std::string ExitName, EntryName; + std::tie(EntryName, ExitName) = getEntryExitStr(); + return EntryName + "---" + ExitName; +} + +std::pair<std::string, std::string> Scop::getEntryExitStr() const { + std::string ExitName, EntryName; raw_string_ostream ExitStr(ExitName); raw_string_ostream EntryStr(EntryName); @@ -4138,7 +4144,7 @@ } else ExitName = "FunctionExit"; - return EntryName + "---" + ExitName; + return std::make_pair(EntryName, ExitName); } __isl_give isl_set *Scop::getContext() const { return isl_set_copy(Context); } Index: polly/trunk/lib/CodeGen/CodeGeneration.cpp =================================================================== --- polly/trunk/lib/CodeGen/CodeGeneration.cpp +++ polly/trunk/lib/CodeGen/CodeGeneration.cpp @@ -184,7 +184,7 @@ IslNodeBuilder NodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock); if (PerfMonitoring) { - PerfMonitor P(EnteringBB->getParent()->getParent()); + PerfMonitor P(S, EnteringBB->getParent()->getParent()); P.initialize(); 
P.insertRegionStart(SplitBlock->getTerminator()); Index: polly/trunk/lib/CodeGen/PerfMonitor.cpp =================================================================== --- polly/trunk/lib/CodeGen/PerfMonitor.cpp +++ polly/trunk/lib/CodeGen/PerfMonitor.cpp @@ -11,8 +11,10 @@ #include "polly/CodeGen/PerfMonitor.h" #include "polly/CodeGen/RuntimeDebugBuilder.h" +#include "polly/ScopInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/Intrinsics.h" +#include <sstream> using namespace llvm; using namespace polly; @@ -60,51 +62,73 @@ return Intrinsic::getDeclaration(M, Intrinsic::x86_rdtscp); } -PerfMonitor::PerfMonitor(Module *M) : M(M), Builder(M->getContext()) { +PerfMonitor::PerfMonitor(const Scop &S, Module *M) + : M(M), Builder(M->getContext()), S(S) { if (Triple(M->getTargetTriple()).getArch() == llvm::Triple::x86_64) Supported = true; else Supported = false; } -void PerfMonitor::addGlobalVariables() { - auto TryRegisterGlobal = [=](const char *Name, Constant *InitialValue, - Value **Location) { - *Location = M->getGlobalVariable(Name); - - if (!*Location) - *Location = new GlobalVariable( - *M, InitialValue->getType(), true, GlobalValue::WeakAnyLinkage, - InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel); - }; +static void TryRegisterGlobal(Module *M, const char *Name, + Constant *InitialValue, Value **Location) { + *Location = M->getGlobalVariable(Name); + + if (!*Location) + *Location = new GlobalVariable( + *M, InitialValue->getType(), true, GlobalValue::WeakAnyLinkage, + InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel); +}; + +// Generate a unique name that is usable as an LLVM name for a scop to name its +// performance counter. 
+static std::string GetScopUniqueVarname(const Scop &S) { + std::stringstream Name; + std::string EntryString, ExitString; + std::tie(EntryString, ExitString) = S.getEntryExitStr(); + + Name << "__polly_perf_cycles_in_" << std::string(S.getFunction().getName()) + << "_from__" << EntryString << "__to__" << ExitString; + return Name.str(); +} - TryRegisterGlobal("__polly_perf_cycles_total_start", Builder.getInt64(0), +void PerfMonitor::addScopCounter() { + const std::string varname = GetScopUniqueVarname(S); + TryRegisterGlobal(M, varname.c_str(), Builder.getInt64(0), + &CyclesInCurrentScopPtr); +} + +void PerfMonitor::addGlobalVariables() { + TryRegisterGlobal(M, "__polly_perf_cycles_total_start", Builder.getInt64(0), &CyclesTotalStartPtr); - TryRegisterGlobal("__polly_perf_initialized", Builder.getInt1(0), + TryRegisterGlobal(M, "__polly_perf_initialized", Builder.getInt1(0), &AlreadyInitializedPtr); - TryRegisterGlobal("__polly_perf_cycles_in_scops", Builder.getInt64(0), + TryRegisterGlobal(M, "__polly_perf_cycles_in_scops", Builder.getInt64(0), &CyclesInScopsPtr); - TryRegisterGlobal("__polly_perf_cycles_in_scop_start", Builder.getInt64(0), + TryRegisterGlobal(M, "__polly_perf_cycles_in_scop_start", Builder.getInt64(0), &CyclesInScopStartPtr); - TryRegisterGlobal("__polly_perf_write_loation", Builder.getInt32(0), + TryRegisterGlobal(M, "__polly_perf_write_loation", Builder.getInt32(0), &RDTSCPWriteLocation); } static const char *InitFunctionName = "__polly_perf_init"; static const char *FinalReportingFunctionName = "__polly_perf_final"; +static BasicBlock *FinalStartBB = nullptr; +static ReturnInst *ReturnFromFinal = nullptr; + Function *PerfMonitor::insertFinalReporting() { // Create new function. 
GlobalValue::LinkageTypes Linkage = Function::WeakODRLinkage; FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false); Function *ExitFn = Function::Create(Ty, Linkage, FinalReportingFunctionName, M); - BasicBlock *Start = BasicBlock::Create(M->getContext(), "start", ExitFn); - Builder.SetInsertPoint(Start); + FinalStartBB = BasicBlock::Create(M->getContext(), "start", ExitFn); + Builder.SetInsertPoint(FinalStartBB); if (!Supported) { RuntimeDebugBuilder::createCPUPrinter( @@ -128,23 +152,42 @@ RuntimeDebugBuilder::createCPUPrinter(Builder, "Total: ", CyclesTotal, "\n"); RuntimeDebugBuilder::createCPUPrinter(Builder, "Scops: ", CyclesInScops, "\n"); - - // Finalize function. - Builder.CreateRetVoid(); + ReturnFromFinal = Builder.CreateRetVoid(); return ExitFn; } +void PerfMonitor::AppendScopReporting() { + Builder.SetInsertPoint(FinalStartBB); + ReturnFromFinal->eraseFromParent(); + + Value *CyclesInCurrentScop = + Builder.CreateLoad(this->CyclesInCurrentScopPtr, true); + std::string EntryName, ExitName; + std::tie(EntryName, ExitName) = S.getEntryExitStr(); + + RuntimeDebugBuilder::createCPUPrinter( + Builder, "Scop(", S.getFunction().getName(), " |from: ", EntryName, + " |to: ", ExitName, "): ", CyclesInCurrentScop, "\n"); + + ReturnFromFinal = Builder.CreateRetVoid(); +} + +static Function *FinalReporting = nullptr; + void PerfMonitor::initialize() { addGlobalVariables(); + addScopCounter(); - Function *F = M->getFunction(InitFunctionName); - if (F) - return; + // Ensure that we only add the final reporting function once. + // On later invocations, append to the reporting function. 
+ if (!FinalReporting) { + FinalReporting = insertFinalReporting(); + + Function *InitFn = insertInitFunction(FinalReporting); + addToGlobalConstructors(InitFn); + } - // initialize - Function *FinalReporting = insertFinalReporting(); - Function *InitFn = insertInitFunction(FinalReporting); - addToGlobalConstructors(InitFn); + AppendScopReporting(); } Function *PerfMonitor::insertInitFunction(Function *FinalReporting) { @@ -223,4 +266,8 @@ Value *CyclesInScops = Builder.CreateLoad(CyclesInScopsPtr, true); CyclesInScops = Builder.CreateAdd(CyclesInScops, CyclesInScop); Builder.CreateStore(CyclesInScops, CyclesInScopsPtr, true); + + Value *CyclesInCurrentScop = Builder.CreateLoad(CyclesInCurrentScopPtr, true); + CyclesInCurrentScop = Builder.CreateAdd(CyclesInCurrentScop, CyclesInScop); + Builder.CreateStore(CyclesInCurrentScop, CyclesInCurrentScopPtr, true); } Index: polly/trunk/test/Isl/CodeGen/perf_monitoring.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/perf_monitoring.ll +++ polly/trunk/test/Isl/CodeGen/perf_monitoring.ll @@ -49,7 +49,6 @@ ; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops ; CHECK-NEXT: %9 = add i64 %8, %7 ; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops -; CHECK-NEXT: br label %return ; CHECK: define weak_odr void @__polly_perf_final() { @@ -66,8 +65,6 @@ ; CHECK-NEXT: %9 = call i32 @fflush(i8* null) ; CHECK-NEXT: %10 = call i32 (...) 
@printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @9, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @7, i32 0, i32 0), i64 %3, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @8, i32 0, i32 0)) ; CHECK-NEXT: %11 = call i32 @fflush(i8* null) -; CHECK-NEXT: ret void -; CHECK-NEXT: } ; CHECK: define weak_odr void @__polly_perf_init() { Index: polly/trunk/test/Isl/CodeGen/perf_monitoring_per_scop.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/perf_monitoring_per_scop.ll +++ polly/trunk/test/Isl/CodeGen/perf_monitoring_per_scop.ll @@ -0,0 +1,100 @@ +; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \ +; RUN: -S < %s | FileCheck %s + +; void f(long A[], long N) { +; long i; +; if (true) +; for (i = 0; i < N; ++i) +; A[i] = i; +; } +; void g(long A[], long N) { +; long i; +; if (true) +; for (i = 0; i < N; ++i) +; A[i] = i; +; } + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +target triple = "x86_64-unknown-linux-gnu" + +define void @f(i64* %A, i64 %N) nounwind { +entry: + fence seq_cst + br label %next + +next: + br i1 true, label %for.i, label %return + +for.i: + %indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ] + %scevgep = getelementptr i64, i64* %A, i64 %indvar + store i64 %indvar, i64* %scevgep + %indvar.next = add nsw i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %N + br i1 %exitcond, label %return, label %for.i + +return: + fence seq_cst + ret void +} + + +define void @g(i64* %A, i64 %N) nounwind { +entry: + fence seq_cst + br label %next + +next: + br i1 true, label %for.i, label %return + +for.i: + %indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ] + %scevgep = getelementptr i64, i64* %A, i64 %indvar + store i64 %indvar, i64* %scevgep + %indvar.next = add nsw i64 %indvar, 1 + %exitcond = 
icmp eq i64 %indvar.next, %N + br i1 %exitcond, label %return, label %for.i + +return: + fence seq_cst + ret void +} + +; Declaration of globals +; CHECK: @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old" = weak thread_local(initialexec) constant i64 0 +; CHECK: @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old" = weak thread_local(initialexec) constant i64 0 + +; Bumping up counter in f +; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting +; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start +; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*)) +; CHECK-NEXT: %7 = sub i64 %6, %5 +; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops +; CHECK-NEXT: %9 = add i64 %8, %7 +; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops +; CHECK-NEXT: %10 = load volatile i64, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old" +; CHECK-NEXT: %11 = add i64 %10, %7 +; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old" +; CHECK-NEXT: br label %return + +; Bumping up counter in g +; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting +; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start +; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*)) +; CHECK-NEXT: %7 = sub i64 %6, %5 +; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops +; CHECK-NEXT: %9 = add i64 %8, %7 +; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops +; CHECK-NEXT: %10 = load volatile i64, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old" +; CHECK-NEXT: %11 = add i64 %10, %7 +; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old" +; CHECK-NEXT: 
br label %return + +; Final reporting prints +; CHECK: %12 = load volatile i64, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old" +; CHECK-NEXT: %13 = call i32 (...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @18, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @10, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @11, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(4)* @12, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @13, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(4)* @14, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([25 x i8], [25 x i8] addrspace(4)* @15, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(4)* @16, i32 0, i32 0), i64 %12, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @17, i32 0, i32 0)) +; CHECK-NEXT: %14 = call i32 @fflush(i8* null) +; CHECK-NEXT: %15 = load volatile i64, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old" +; CHECK-NEXT: %16 = call i32 (...) 
@printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @27, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @19, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @20, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(4)* @21, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @22, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(4)* @23, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([25 x i8], [25 x i8] addrspace(4)* @24, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(4)* @25, i32 0, i32 0), i64 %15, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @26, i32 0, i32 0)) +; CHECK-NEXT: %17 = call i32 @fflush(i8* null) +; CHECK-NEXT: ret void