Index: polly/trunk/include/polly/CodeGen/PerfMonitor.h
===================================================================
--- polly/trunk/include/polly/CodeGen/PerfMonitor.h
+++ polly/trunk/include/polly/CodeGen/PerfMonitor.h
@@ -0,0 +1,141 @@
+//===--- PerfMonitor.h --- Monitor time spent in scops --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PERF_MONITOR_H
+#define PERF_MONITOR_H
+
+#include "polly/CodeGen/IRBuilder.h"
+
+namespace llvm {
+class Function;
+class Module;
+class Value;
+class Instruction;
+} // namespace llvm
+
+namespace polly {
+
+class PerfMonitor {
+public:
+  /// Create a new performance monitor.
+  ///
+  /// @param M The module for which to generate the performance monitor.
+  PerfMonitor(llvm::Module *M);
+
+  /// Initialize the performance monitor.
+  ///
+  /// Ensure that all global variables, functions, and callbacks needed to
+  /// manage the performance monitor are initialized and registered.
+  void initialize();
+
+  /// Mark the beginning of a timing region.
+  ///
+  /// @param InsertBefore The instruction before which the timing region starts.
+  void insertRegionStart(llvm::Instruction *InsertBefore);
+
+  /// Mark the end of a timing region.
+  ///
+  /// @param InsertBefore The instruction before which the timing region ends.
+  void insertRegionEnd(llvm::Instruction *InsertBefore);
+
+private:
+  llvm::Module *M;
+  PollyIRBuilder Builder;
+
+  /// Indicates if performance profiling is supported on this architecture.
+  bool Supported;
+
+  /// The cycle counter at the beginning of the program execution.
+  llvm::Value *CyclesTotalStartPtr = nullptr;
+
+  /// The total number of cycles spent within scops.
+  llvm::Value *CyclesInScopsPtr = nullptr;
+
+  /// The value of the cycle counter at the beginning of the last scop.
+  llvm::Value *CyclesInScopStartPtr = nullptr;
+
+  /// A memory location which serves as argument of the RDTSCP function.
+  ///
+  /// The value written to this location is currently not used.
+  llvm::Value *RDTSCPWriteLocation = nullptr;
+
+  /// A global variable, that keeps track if the performance monitor
+  /// initialization has already been run.
+  llvm::Value *AlreadyInitializedPtr = nullptr;
+
+  /// Create function "__polly_perf_init".
+  ///
+  /// The generated function returns early if it has already been run.
+  /// Otherwise it marks the monitor as initialized, registers
+  /// @p FinalReporting with atexit() and, on supported targets, stores the
+  /// current cycle counter as the start of the program execution.
+  ///
+  /// @param FinalReporting The reporting function to register with atexit().
+  /// @returns The newly created initialization function.
+  llvm::Function *insertInitFunction(llvm::Function *FinalReporting);
+
+  /// Add Function @p Fn to list of global constructors
+  ///
+  /// If no global constructors are available in this current module, insert
+  /// a new list of global constructors containing @p Fn as only global
+  /// constructor. Otherwise, append @p Fn to the list of global constructors.
+  ///
+  /// All functions listed as global constructors are executed before the
+  /// main() function is called.
+  ///
+  /// @param Fn Function to add to global constructors
+  void addToGlobalConstructors(llvm::Function *Fn);
+
+  /// Add global variables to module.
+  ///
+  /// Insert a set of global variables that are used to track performance,
+  /// into the module (or obtain references to them if they already exist).
+  void addGlobalVariables();
+
+  /// Get a reference to the intrinsic "i64 @llvm.x86.rdtscp(i8*)".
+  ///
+  /// The rdtscp function returns the current value of the processor's
+  /// time-stamp counter as well as the current CPU identifier. On modern x86
+  /// systems, the returned value is independent of the dynamic clock frequency
+  /// and consistent across multiple cores. It can consequently be used to get
+  /// accurate and low-overhead timing information. Even though the counter is
+  /// wrapping, it can be reliably used even for measuring longer time
+  /// intervals, as on a 1 GHz processor the counter only wraps every 584 years.
+  ///
+  /// The RDTSCP instruction is "pseudo" serializing:
+  ///
+  /// "The RDTSCP instruction waits until all previous instructions have been
+  /// executed before reading the counter. However, subsequent instructions may
+  /// begin execution before the read operation is performed."
+  ///
+  /// To ensure that no later instructions are scheduled before the RDTSCP
+  /// instruction it is often recommended to schedule a cpuid call after the
+  /// RDTSCP instruction. We do not do this yet, trading some imprecision in
+  /// our timing for a reduced overhead in our timing.
+  ///
+  /// @returns A reference to the declaration of @llvm.x86.rdtscp.
+  llvm::Function *getRDTSCP();
+
+  /// Get a reference to "int atexit(void (*function)(void))" function.
+  ///
+  /// This function allows to register function pointers that must be executed
+  /// when the program is terminated.
+  ///
+  /// @returns A reference to @atexit().
+  llvm::Function *getAtExit();
+
+  /// Create function "__polly_perf_final".
+  ///
+  /// This function finalizes the performance measurements and prints the
+  /// results to stdout. It is expected to be registered with 'atexit()'.
+  llvm::Function *insertFinalReporting();
+};
+} // namespace polly
+
+#endif
Index: polly/trunk/lib/CMakeLists.txt
===================================================================
--- polly/trunk/lib/CMakeLists.txt
+++ polly/trunk/lib/CMakeLists.txt
@@ -43,6 +43,7 @@
   CodeGen/Utils.cpp
   CodeGen/RuntimeDebugBuilder.cpp
   CodeGen/CodegenCleanup.cpp
+  CodeGen/PerfMonitor.cpp
   ${GPGPU_CODEGEN_FILES}
   Exchange/JSONExporter.cpp
   Support/GICHelper.cpp
Index: polly/trunk/lib/CodeGen/CodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/CodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/CodeGeneration.cpp
@@ -21,6 +21,7 @@
 
 #include "polly/CodeGen/IslAst.h"
 #include "polly/CodeGen/IslNodeBuilder.h"
+#include "polly/CodeGen/PerfMonitor.h"
 #include "polly/CodeGen/Utils.h"
 #include "polly/DependenceInfo.h"
 #include "polly/LinkAllPasses.h"
@@ -45,6 +46,11 @@
            cl::Hidden, cl::init(true), cl::ZeroOrMore,
            cl::cat(PollyCategory));
 
+static cl::opt<bool>
+    PerfMonitoring("polly-codegen-perf-monitoring",
+                   cl::desc("Add run-time performance monitoring"), cl::Hidden,
+                   cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+
 namespace {
 class CodeGeneration : public ScopPass {
 public:
@@ -145,6 +151,18 @@
     IslNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, S,
                                StartBlock);
 
+    if (PerfMonitoring) {
+      PerfMonitor P(EnteringBB->getParent()->getParent());
+      P.initialize();
+      P.insertRegionStart(SplitBlock->getTerminator());
+
+      BasicBlock *MergeBlock = SplitBlock->getTerminator()
+                                   ->getSuccessor(0)
+                                   ->getUniqueSuccessor()
+                                   ->getUniqueSuccessor();
+      P.insertRegionEnd(MergeBlock->getTerminator());
+    }
+
     // First generate code for the hoisted invariant loads and transitively the
     // parameters they reference. Afterwards, for the remaining parameters that
     // might reference the hoisted loads. Finally, build the runtime check
Index: polly/trunk/lib/CodeGen/PerfMonitor.cpp
===================================================================
--- polly/trunk/lib/CodeGen/PerfMonitor.cpp
+++ polly/trunk/lib/CodeGen/PerfMonitor.cpp
@@ -0,0 +1,237 @@
+//===------ PerfMonitor.cpp - Generate a run-time performance monitor. -======//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/CodeGen/PerfMonitor.h"
+#include "polly/CodeGen/RuntimeDebugBuilder.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+using namespace polly;
+
+Function *PerfMonitor::getAtExit() {
+  const char *Name = "atexit";
+  Function *F = M->getFunction(Name);
+
+  if (!F) {
+    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+    FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(),
+                                         {Builder.getInt8PtrTy()}, false);
+    F = Function::Create(Ty, Linkage, Name, M);
+  }
+
+  return F;
+}
+
+void PerfMonitor::addToGlobalConstructors(Function *Fn) {
+  const char *Name = "llvm.global_ctors";
+  GlobalVariable *GV = M->getGlobalVariable(Name);
+  std::vector<Constant *> V;
+
+  if (GV) {
+    Constant *Array = GV->getInitializer();
+    for (Value *X : Array->operand_values())
+      V.push_back(cast<Constant>(X));
+    GV->eraseFromParent();
+  }
+
+  StructType *ST = StructType::get(Builder.getInt32Ty(), Fn->getType(),
+                                   Builder.getInt8PtrTy(), nullptr);
+
+  V.push_back(ConstantStruct::get(
+      ST, Builder.getInt32(10), Fn,
+      ConstantPointerNull::get(Builder.getInt8PtrTy()), nullptr));
+  ArrayType *Ty = ArrayType::get(ST, V.size());
+
+  GV = new GlobalVariable(*M, Ty, true, GlobalValue::AppendingLinkage,
+                          ConstantArray::get(Ty, V), Name, nullptr,
+                          GlobalVariable::NotThreadLocal);
+}
+
+Function *PerfMonitor::getRDTSCP() {
+  const char *Name = "llvm.x86.rdtscp";
+  Function *F = M->getFunction(Name);
+
+  if (!F) {
+    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+    FunctionType *Ty = FunctionType::get(Builder.getInt64Ty(),
+                                         {Builder.getInt8PtrTy()}, false);
+    F = Function::Create(Ty, Linkage, Name, M);
+  }
+
+  return F;
+}
+
+PerfMonitor::PerfMonitor(Module *M) : M(M), Builder(M->getContext()) {
+  if (Triple(M->getTargetTriple()).getArch() == llvm::Triple::x86_64)
+    Supported = true;
+  else
+    Supported = false;
+}
+
+void PerfMonitor::addGlobalVariables() {
+  auto TryRegisterGlobal = [=](const char *Name, Constant *InitialValue,
+                               Value **Location) {
+    *Location = M->getGlobalVariable(Name);
+
+    // The monitor stores to these globals at run time, so they must not be
+    // marked constant.
+    if (!*Location)
+      *Location = new GlobalVariable(
+          *M, InitialValue->getType(), false, GlobalValue::WeakAnyLinkage,
+          InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel);
+  };
+
+  TryRegisterGlobal("__polly_perf_cycles_total_start", Builder.getInt64(0),
+                    &CyclesTotalStartPtr);
+
+  TryRegisterGlobal("__polly_perf_initialized", Builder.getInt1(0),
+                    &AlreadyInitializedPtr);
+
+  TryRegisterGlobal("__polly_perf_cycles_in_scops", Builder.getInt64(0),
+                    &CyclesInScopsPtr);
+
+  TryRegisterGlobal("__polly_perf_cycles_in_scop_start", Builder.getInt64(0),
+                    &CyclesInScopStartPtr);
+
+  TryRegisterGlobal("__polly_perf_write_loation", Builder.getInt32(0),
+                    &RDTSCPWriteLocation);
+}
+
+static const char *InitFunctionName = "__polly_perf_init";
+static const char *FinalReportingFunctionName = "__polly_perf_final";
+
+Function *PerfMonitor::insertFinalReporting() {
+  // Create new function.
+  GlobalValue::LinkageTypes Linkage = Function::WeakODRLinkage;
+  FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false);
+  Function *ExitFn =
+      Function::Create(Ty, Linkage, FinalReportingFunctionName, M);
+  BasicBlock *Start = BasicBlock::Create(M->getContext(), "start", ExitFn);
+  Builder.SetInsertPoint(Start);
+
+  if (!Supported) {
+    RuntimeDebugBuilder::createCPUPrinter(
+        Builder, "Polly runtime information generation not supported\n");
+    Builder.CreateRetVoid();
+    return ExitFn;
+  }
+
+  // Measure current cycles and compute final timings.
+  Function *RDTSCPFn = getRDTSCP();
+  Value *CurrentCycles = Builder.CreateCall(
+      RDTSCPFn,
+      Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy()));
+  Value *CyclesStart = Builder.CreateLoad(CyclesTotalStartPtr, true);
+  Value *CyclesTotal = Builder.CreateSub(CurrentCycles, CyclesStart);
+  Value *CyclesInScops = Builder.CreateLoad(CyclesInScopsPtr, true);
+
+  // Print the runtime information.
+  RuntimeDebugBuilder::createCPUPrinter(Builder, "Polly runtime information\n");
+  RuntimeDebugBuilder::createCPUPrinter(Builder, "-------------------------\n");
+  RuntimeDebugBuilder::createCPUPrinter(Builder, "Total: ", CyclesTotal, "\n");
+  RuntimeDebugBuilder::createCPUPrinter(Builder, "Scops: ", CyclesInScops,
+                                        "\n");
+
+  // Finalize function.
+  Builder.CreateRetVoid();
+  return ExitFn;
+}
+
+void PerfMonitor::initialize() {
+  addGlobalVariables();
+
+  Function *F = M->getFunction(InitFunctionName);
+  if (F)
+    return;
+
+  // initialize
+  Function *FinalReporting = insertFinalReporting();
+  Function *InitFn = insertInitFunction(FinalReporting);
+  addToGlobalConstructors(InitFn);
+}
+
+Function *PerfMonitor::insertInitFunction(Function *FinalReporting) {
+  // Insert function definition and BBs.
+  GlobalValue::LinkageTypes Linkage = Function::WeakODRLinkage;
+  FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false);
+  Function *InitFn = Function::Create(Ty, Linkage, InitFunctionName, M);
+  BasicBlock *Start = BasicBlock::Create(M->getContext(), "start", InitFn);
+  BasicBlock *EarlyReturn =
+      BasicBlock::Create(M->getContext(), "earlyreturn", InitFn);
+  BasicBlock *InitBB = BasicBlock::Create(M->getContext(), "initbb", InitFn);
+
+  Builder.SetInsertPoint(Start);
+
+  // Check if this function was already run. If yes, return.
+  //
+  // In case profiling has been enabled in multiple translation units, the
+  // initializer function will be added to the global constructors list of
+  // each translation unit. When merging translation units, the global
+  // constructor lists are just appended, such that the initializer will appear
+  // multiple times. To avoid initializations being run multiple times (and
+  // especially to avoid that atExitFn is called more than once), we bail
+  // out if the initializer is run more than once.
+  Value *HasRunBefore = Builder.CreateLoad(AlreadyInitializedPtr);
+  Builder.CreateCondBr(HasRunBefore, EarlyReturn, InitBB);
+  Builder.SetInsertPoint(EarlyReturn);
+  Builder.CreateRetVoid();
+
+  // Keep track that this function has been run once.
+  Builder.SetInsertPoint(InitBB);
+  Value *True = Builder.getInt1(true);
+  Builder.CreateStore(True, AlreadyInitializedPtr);
+
+  // Register the final reporting function with atexit().
+  Value *FinalReportingPtr =
+      Builder.CreatePointerCast(FinalReporting, Builder.getInt8PtrTy());
+  Function *AtExitFn = getAtExit();
+  Builder.CreateCall(AtExitFn, {FinalReportingPtr});
+
+  if (Supported) {
+    // Read the current cycle counter and store the result for later.
+    Function *RDTSCPFn = getRDTSCP();
+    Value *CurrentCycles = Builder.CreateCall(
+        RDTSCPFn,
+        Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy()));
+    Builder.CreateStore(CurrentCycles, CyclesTotalStartPtr, true);
+  }
+  Builder.CreateRetVoid();
+
+  return InitFn;
+}
+
+void PerfMonitor::insertRegionStart(Instruction *InsertBefore) {
+  if (!Supported)
+    return;
+
+  Builder.SetInsertPoint(InsertBefore);
+  Function *RDTSCPFn = getRDTSCP();
+  Value *CurrentCycles = Builder.CreateCall(
+      RDTSCPFn,
+      Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy()));
+  Builder.CreateStore(CurrentCycles, CyclesInScopStartPtr, true);
+}
+
+void PerfMonitor::insertRegionEnd(Instruction *InsertBefore) {
+  if (!Supported)
+    return;
+
+  Builder.SetInsertPoint(InsertBefore);
+  Function *RDTSCPFn = getRDTSCP();
+  LoadInst *CyclesStart = Builder.CreateLoad(CyclesInScopStartPtr, true);
+  Value *CurrentCycles = Builder.CreateCall(
+      RDTSCPFn,
+      Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy()));
+  Value *CyclesInScop = Builder.CreateSub(CurrentCycles, CyclesStart);
+  Value *CyclesInScops = Builder.CreateLoad(CyclesInScopsPtr, true);
+  CyclesInScops = Builder.CreateAdd(CyclesInScops, CyclesInScop);
+  Builder.CreateStore(CyclesInScops, CyclesInScopsPtr, true);
+}
Index: polly/trunk/test/Isl/CodeGen/perf_monitoring.ll
===================================================================
--- polly/trunk/test/Isl/CodeGen/perf_monitoring.ll
+++ polly/trunk/test/Isl/CodeGen/perf_monitoring.ll
@@ -0,0 +1,87 @@
+; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
+; RUN:   -S < %s | FileCheck %s
+
+; void f(long A[], long N) {
+;   long i;
+;   if (true)
+;     for (i = 0; i < N; ++i)
+;       A[i] = i;
+; }
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f(i64* %A, i64 %N) nounwind {
+entry:
+  fence seq_cst
+  br label %next
+
+next:
+  br i1 true, label %for.i, label %return
+
+for.i:
+  %indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
+  %scevgep = getelementptr i64, i64* %A, i64 %indvar
+  store i64 %indvar, i64* %scevgep
+  %indvar.next = add nsw i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %N
+  br i1 %exitcond, label %return, label %for.i
+
+return:
+  fence seq_cst
+  ret void
+}
+
+; CHECK:      @__polly_perf_cycles_total_start = weak thread_local(initialexec) global i64 0
+; CHECK-NEXT: @__polly_perf_initialized = weak thread_local(initialexec) global i1 false
+; CHECK-NEXT: @__polly_perf_cycles_in_scops = weak thread_local(initialexec) global i64 0
+; CHECK-NEXT: @__polly_perf_cycles_in_scop_start = weak thread_local(initialexec) global i64 0
+; CHECK-NEXT: @__polly_perf_write_loation = weak thread_local(initialexec) global i32 0
+
+; CHECK:      polly.split_new_and_old: ; preds = %entry
+; CHECK-NEXT:   %0 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
+; CHECK-NEXT:   store volatile i64 %0, i64* @__polly_perf_cycles_in_scop_start
+
+; CHECK:      polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
+; CHECK-NEXT:   %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
+; CHECK-NEXT:   %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
+; CHECK-NEXT:   %7 = sub i64 %6, %5
+; CHECK-NEXT:   %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
+; CHECK-NEXT:   %9 = add i64 %8, %7
+; CHECK-NEXT:   store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
+; CHECK-NEXT:   br label %return
+
+
+; CHECK:      define weak_odr void @__polly_perf_final() {
+; CHECK-NEXT: start:
+; CHECK-NEXT:   %0 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
+; CHECK-NEXT:   %1 = load volatile i64, i64* @__polly_perf_cycles_total_start
+; CHECK-NEXT:   %2 = sub i64 %0, %1
+; CHECK-NEXT:   %3 = load volatile i64, i64* @__polly_perf_cycles_in_scops
+; CHECK-NEXT:   %4 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @1, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @0, i32 0, i32 0))
+; CHECK-NEXT:   %5 = call i32 @fflush(i8* null)
+; CHECK-NEXT:   %6 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @3, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @2, i32 0, i32 0))
+; CHECK-NEXT:   %7 = call i32 @fflush(i8* null)
+; CHECK-NEXT:   %8 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @6, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @4, i32 0, i32 0), i64 %2, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @5, i32 0, i32 0))
+; CHECK-NEXT:   %9 = call i32 @fflush(i8* null)
+; CHECK-NEXT:   %10 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @9, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @7, i32 0, i32 0), i64 %3, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @8, i32 0, i32 0))
+; CHECK-NEXT:   %11 = call i32 @fflush(i8* null)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+
+
+; CHECK:      define weak_odr void @__polly_perf_init() {
+; CHECK-NEXT: start:
+; CHECK-NEXT:   %0 = load i1, i1* @__polly_perf_initialized
+; CHECK-NEXT:   br i1 %0, label %earlyreturn, label %initbb
+
+; CHECK:      earlyreturn: ; preds = %start
+; CHECK-NEXT:   ret void
+
+; CHECK:      initbb: ; preds = %start
+; CHECK-NEXT:   store i1 true, i1* @__polly_perf_initialized
+; CHECK-NEXT:   %1 = call i32 @atexit(i8* bitcast (void ()* @__polly_perf_final to i8*))
+; CHECK-NEXT:   %2 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
+; CHECK-NEXT:   store volatile i64 %2, i64* @__polly_perf_cycles_total_start
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }