Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -249,7 +249,7 @@ void initializeScalarEvolutionPass(PassRegistry&); void initializeShrinkWrapPass(PassRegistry &); void initializeSimpleInlinerPass(PassRegistry&); -void initializeShadowStackGCLoweringPass(PassRegistry&); +void initializeShadowStackGCLoweringPass(PassRegistry&); void initializeRegisterCoalescerPass(PassRegistry&); void initializeSingleLoopExtractorPass(PassRegistry&); void initializeSinkingPass(PassRegistry&); @@ -300,6 +300,10 @@ void initializeDwarfEHPreparePass(PassRegistry&); void initializeFloat2IntPass(PassRegistry&); void initializeLoopDistributePass(PassRegistry&); +void initializeHexe(PassRegistry &); +void initializeWorkloadAnalysisPass(PassRegistry &); +void initializeWorkloadExtractorPass(PassRegistry &); +void initializeWorkloadTransformPass(PassRegistry &); } #endif Index: include/llvm/Transforms/Hexe/HeterogeneousAdaptors/Adaptors.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/HeterogeneousAdaptors/Adaptors.h @@ -0,0 +1,58 @@ +//===--- Transforms/Hexe/HeterogeneousAdaptors/Adaptor.h - Hexe --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// ===------- High-Level Description of the Hexe Adaptor Interface -------=== +/// This file provides the Hexe Workload Adaptor interface. Implementations of +/// this interface are responsible for transforming a Hexe Workload to a +/// particular set of conventions required by an accelerator and its programming +/// environment. This file also includes function declarations for the +/// initialization of the available plugins. +//===---------------------------------------------------------------------===// + + +#ifndef LLVM_TRANSFORMS_HEXE_HETEROGENEOUSADAPTORS_H +#define LLVM_TRANSFORMS_HEXE_HETEROGENEOUSADAPTORS_H + +#include +#include + +namespace llvm { + +class Function; +class Module; +class Triple; + +typedef std::tuple HexeFunctionInfoT; +typedef std::vector HexeFunctionInfoListT; + +class HexeWorkloadAdaptor { +public: + HexeWorkloadAdaptor() {}; + virtual ~HexeWorkloadAdaptor() {}; + + /// \brief It performs the required transformations. + /// After the completion of this, the Module should comply + /// to the conventions of the Adaptor. + virtual void transform(Module *Module, + const Triple &HostTriple, const Triple &AccelTriple, + const HexeFunctionInfoListT &FunctionList) = 0; + + /// It checks if a particular combination of Host and Accelerator Triples + /// is supported by the adaptor. + virtual bool isSupported(const Triple &HostTriple, + const Triple &AccelTriple) = 0; +}; + + +/// \brief It creates and returns an instrance of the Hexagon Adaptor. +HexeWorkloadAdaptor *createHexagonWorkloadAdaptor(); + +} + +#endif Index: include/llvm/Transforms/Hexe/Hexe.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/Hexe.h @@ -0,0 +1,64 @@ +//===-- Transforms/Hexe/Hexe.h - Heterogeneous Execution Engine -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file +/// This header file provides function prototypes for the instantiation of Hexe +/// Passes. It also declares the command line flags that control the Hexe +/// operations. +//===---------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_HEXE_HEXE_H +#define LLVM_TRANSFORMS_HEXE_HEXE_H + +#include "llvm/Support/CommandLine.h" +#include + +namespace llvm { + +class ModulePass; + + +//===------------------------------------------------------------------===// +// +// Workload Analysis Pass - This pass analyzes workloads for offloading to +// DSPs/GPUs +// + +ModulePass *createWorkloadAnalysisPass(); + + + +//===------------------------------------------------------------------===// +// +// Workload Extractor Pass - This pass extracts workloads for offloading to +// DSPs/GPUs +// + +ModulePass *createWorkloadExtractorPass(); + + +//===------------------------------------------------------------------===// +// +// Workload Extractor Pass - This pass extracts workloads for offloading to +// DSPs/GPUs +// + +FunctionPass *createWorkloadTransformPass(); + + +//===------------------------------------------------------------------===// +//===------------------------------------------------------------------===// +//HEXE FLAGS +extern cl::opt HexeFunctionCalls; +extern cl::opt HexeLoops; +extern cl::opt HexePolicy; +extern cl::opt HexeWorkloadFName; +extern cl::opt HexeAdaptor; +extern cl::opt HexeAdaptorCheck; +} + +#endif //LLVM_TRANSFORMS_HEXE_HEXE_H Index: include/llvm/Transforms/Hexe/InitializeHexePasses.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/InitializeHexePasses.h @@ -0,0 +1,28 @@ +//===------- Transforms/Hexe/InitializeHexePasses.h - Hexe -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This header file provides function prototypes for the initialization of +/// the Heterogeneous Execution Engine Passes. +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HEXE_INITIALIZEHEXEPASSES_H +#define LLVM_TRANSFORMS_HEXE_INITIALIZEHEXEPASSES_H + +namespace llvm { +class PassRegistry; + +/// \file initializeHexe - Initialize all passes linked into +/// Hexe library. +void initializeHexe(PassRegistry &); +void initializeWorkloadAnalysisPass(PassRegistry &); +void initializeWorkloadExtractorPass(PassRegistry &); +void initializeWorkloadTransformPass(PassRegistry &); +} + +#endif //LLVM_TRANSFORMS_HEXE_INITIALIZEHEXEPASSES_H Index: include/llvm/Transforms/Hexe/Utils.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/Utils.h @@ -0,0 +1,68 @@ +//===-- Transforms/Hexe/Utils.h - Heterogeneous Execution Engine -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This header file provides prototypes for utility functions that +/// a)create Struct Types based on function interfaces. 
+/// b)Hexe Metadata read and write functions. +/// c)erase Hexe Metadata from a Module +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HEXE_UTIL_H +#define LLVM_TRANSFORMS_HEXE_UTIL_H + +#include +#include +#include + +namespace llvm { + +class Function; +class FunctionType; +class LLVMContext; +class Module; +class StructType; +class Triple; + +typedef std::tuple HexeFunctionInfoT; +typedef std::tuple< Function *, unsigned, StructType *> HexeFunctionStructInfoT; +typedef std::vector HexeFunctionInfoListT; +typedef std::map FunctionHexeFunctionMapT; + +/// \brief getCompactFunctionStruct builds a Struct Type based on the +/// function interface of a function. The interface is given as the +/// Function Type FT. +// +/// \returns the Struct Type. +StructType *getCompactFunctionStruct(FunctionType *FT, LLVMContext &C); + +/// \brief readHexeMetadata reads the Hexe metadata from a Module. +/// +/// \param NumHexeFunctions: the number of Hexe functions included +/// in the module. +/// \param FunctionList: the list of functions. +/// \param HostTriple: the triple of the host platform. +void readHexeMetadata(Module *M, unsigned &NumHexeFunctions, + HexeFunctionInfoListT &FunctionList, Triple &HostTriple); + +/// \brief writeHexeMetadata writes the Hexe metadata to a Module. +/// +/// \param NumHexeFunctions: the number of Hexe functions included +/// in the module. +/// \param FunctionMap: the function map. +/// \param HostTriple: the triple of the host platform. +void writeHexeMetadata(Module *M, unsigned NumHexeFunctions, + const FunctionHexeFunctionMapT &FunctionMap, + const Triple &HostTriple); + +/// \brief erase Hexe Metadata from Module +void eraseHexeMetadata(Module *M); + +} + +#endif //LLVM_TRANSFORMS_HEXE_UTIL_H Index: include/llvm/Transforms/Hexe/WorkloadAnalysis.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/WorkloadAnalysis.h @@ -0,0 +1,380 @@ +//===---------- Transforms/Hexe/WorkloadAnalysis.h - Hexe -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// ===----------- High-Level Description of the Workload Analysis ----------=== +/// Key Goal: Design and implement an analysis pass that can reason if a loop +/// or a function call is eligible for offloading to an accelerator. If a loop +/// or function call is considered eligible for offloading, the analysis +/// generates a Workload Information handler that can be used by the offloading +/// transformations. +/// +/// Key points: +/// Architecture constrains: +/// a) Memory Coherency +/// The main processor and the accelerator may share a joint or +/// disjoint memory with full, limited or no coherency support. +/// In case of systems where coherency is supported, it is also +/// proven that explicit communication and synchronization leads +/// to higher performance. It is important to be able to reason +/// about the memory a loop or function call accesses. Our approach +/// follows the assumption that the target system does not have +/// hardware support for coherency and runtime/driver operations +/// are required. However this can be relaxed to "take advantage" +/// of coherent systems. 
+/// b) Data Layout +/// A function or loop may access complex data types containing +/// scalars and pointers referring to arbitrary addresses in memory. +/// We constrain our analysis criteria to consider eligible for +/// offloading only loops and functions that access data types +/// which do not have nested pointer types. Any combination of +/// scalars, structs and arrays is supported. Again, this could +/// be relaxed for coherent systems. +/// c) Atomic, Specialized Instruction/Operations +/// We do not consider as eligible for offloading code that +/// contains atomics, and specialized instructions expressed +/// as intrinsics. The reason is that the main processor and +/// accelerator architectures may vary significantly in their +/// capabilities. Again, analysis specialization for particular +/// systems could be considered. +/// +/// +/// Hexe Approach: +/// The analysis operates in two steps: (1) Code and Memory Reference +/// Analysis for Functions and Loops and (2) Memory Allocation Use +/// Analysis. +/// +/// 1) Code and Memory Reference Analysis for Functions and Loops: +/// A) We analyze all the instructions of the Function and Loop +/// codes to decide if we support them for offloading. +/// +/// B) We then analyze the memory references performed by the +/// same code by analyzing the load and store instructions. +/// +/// a) In case of Functions: +/// The load and store instructions should either refer +/// to memory segments available as Global Variables or +/// memory segments provided by the function interface. +/// +/// b) In case of Loops: +/// The load and store instructions should either refer +/// to memory segments available as Global Variables or +/// to memory segments provided by their parent function +/// interface. +/// +/// 2) Memory Allocation Use Analysis +/// At this point we proceed our analysis only on functions and loops +/// that successfully passed the first step. The idea here is to +/// map the memory references we detected in the previous +/// step to actual Memory Allocations. We consider three type of +/// Allocations so far: a) Global Variables, b) Dynamic Allocations +/// (malloc) and c) Stack Allocations (alloca). +/// +/// Both Functions and Loops have memory references that either +/// are Global Variables or they are provided by the Function Interface. +/// +/// A) Global Variable References. +/// We already know that they are Global Variables and we know +/// their allocation, so there is nothing to be done here. +/// +/// B) Function Interface References (The interesting case). +/// The origin of the memory references is determined at the +/// call sites of the function. Each time a function is called +/// those references may be mapped to any type of Memory +/// Allocation and we need to analyze them. That is the reason +/// we consider Function Calls for offloading and not just +/// Function definitions. +/// +/// IMPORTANT NOTE: +/// A Loop is considered eligible for offloading only if all the +/// calls of its parent function can be successfully analyzed. +/// We apply this limitation because we want to avoid the need +/// for generating multiple code versions for the function +/// that hosts the loop. Generating multiple versions would be +/// the case if we have a situation where some call sites of the +/// parent function can be analyzed and some others not. +/// +/// +/// FUTURE PLANS (TODO): +/// A)Extend the analysis API to support user given information +/// about eligible code for offloading. 
This can be useful in the +/// following cases: +/// a) Compiling code with special pragmas or attributes. +/// b) Compiling Functional and Domain Specific Languages. +/// c) Exploit runtime information in case of combining +/// Hexe with MCJIT. +/// B)Enable a relaxed analysis mode for fully coherent systems +//===-----------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_HEXE_WORKLOADANALYSIS_H +#define LLVM_TRANSFORMS_HEXE_WORKLOADANALYSIS_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Pass.h" +#include +#include + +namespace llvm { + +class Function; +class Loop; +class Value; +class Type; +class LoopInfo; +class DominatorTree; +class CallGraph; +class TargetLibraryInfo; +class WorkloadAnalysis; + +/// \brief This enum class defines the available Workload Types +/// that are supported by the Workload Analysis. +/// +/// Available Workload Types: +/// Function, +/// Call, a Function Call +/// Loop +/// There are two variations of the above: +/// NoGlobals: the workload does not access global variables +/// WithGlobals: the workload accesses global variables +enum class WorkloadType { + FunctionNoGlobals, + FunctionWithGlobals, + CallNoGlobals, + CallWithGlobals, + LoopNoGlobals, + LoopWithGlobals +}; + +/// \brief It represents a Memory Reference +/// +/// This is a typedef of the AliasAnalysis::Location +typedef AliasAnalysis::Location WorkloadMemRef; + + +/// \brief This Comparator orders Workload Memory References +/// by their pointer Value. +struct WorkloadMemRefComparator { + bool operator() (const WorkloadMemRef &a, const WorkloadMemRef &b) const + { return a.Ptr WorkloadMemRefSet; + +/// \brief Objects of this class represent Workloads that +/// we consider for offloading. Those workloads can either +/// be Function Calls and Loops. +/// +/// Workloads of Function type are also supported but they +/// are exclusively used by the analysis internals. +/// +/// The reason we use this class design instead of using the +/// llvm virtual class style is that a Workload object gets +/// transformed across the different stages of the Workload +/// Analysis and it is more efficient to be able to mutate +/// the same object. +class WorkloadInfo { +public: + WorkloadInfo() {} + + /// \returns true if the Workload is a Function. + bool isFunction() const { + return ((type == WorkloadType::FunctionNoGlobals) || + (type == WorkloadType::FunctionWithGlobals)); + } + + /// \returns true if the Workload is a Function Call. + bool isCall() const { + return ((type == WorkloadType::CallNoGlobals) || + (type == WorkloadType::CallWithGlobals)); + } + + /// \returns true if the Workload is a Loop. 
+ bool isLoop() const { + return ((type == WorkloadType::LoopNoGlobals) || + (type == WorkloadType::LoopWithGlobals)); + } + + /// \returns the function considered for offloading + Function *getFunction() const { return F; } + + /// \returns the loop considered for offloading + Loop *getLoop() const { return L; } + + /// \returns the Function Call + CallInst *getCall() const { return CI; } + + /// \returns the Interface Memory References + WorkloadMemRefSet &getInterfaceMemRefs() { return InterfaceMemRefs; } + + /// \returns the Global Variable Memory References + WorkloadMemRefSet &getGlobalMemRefs() { return GlobalMemRefs; } + + /// \returns the Global Variable Allocations accessed + /// by the workload + SetVector &getGlobalAllocations() { + return GlobalAllocations; + } + + /// \returns the Heap Memory Allocations accessed + /// by the Workload + SetVector &getMallocAllocations() { return MallocAllocations; } + + /// \returns the Stack Memory Allocations accessed + /// by the Workload + SetVector &getAllocaAllocations() { return AllocaAllocations; } + + private: + WorkloadType type; //The Workload Type + Function *F; //The function to get offloaded. + + union{ + CallInst *CI; //The Caller of the Function to get offloaded. + Loop *L; //The Loop to get extracted. + }; + + //Memory References + WorkloadMemRefSet InterfaceMemRefs; //passed by the function interface + WorkloadMemRefSet GlobalMemRefs; //global references (global vars) + + //Memory Allocations + SetVector GlobalAllocations; //Global Variables + SetVector MallocAllocations; //Heap Allocations + SetVector AllocaAllocations; //Stack Allocations + friend class WorkloadAnalysis; +}; + +typedef std::map CallWorkloadMapT; +typedef std::map LoopWorkloadMapT; +typedef std::map FunctionWorkloadT; +typedef std::set> FunctionCallersSet; + +/// \brief This Pass provides the Workload Analysis for the Heterogeneous +/// Execution Engine. It provides a high level interface. The user can +/// query the offloading eligibility of a Function Call or Loop and retrieve +/// a WorkloadInfo handler that can be used for the offloading transformations. +class WorkloadAnalysis : public ModulePass { +public: + WorkloadAnalysis(); + ~WorkloadAnalysis() {}; + static char ID; + /// \brief It requests analyses etc + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M) override; + + /// It releases the memory used by the analysis + void releaseMemory() override; + + /// \brief This function prints the analyses results. + /// It provides the results for both analysis steps, + /// (a) Code and Memory Analysis and (b) Memory + /// Allocation Use Analysis. + /// + ///We report: + /// Code and Memory Reference Eligibility: + /// Function Workloads + /// Loop Workloads + /// Memory Allocation Use Eligibility: + /// Function Call Workloads (Functions as part + /// of a particular Function Call context) + /// Loops Workloads) + void print(raw_ostream &O, const Module *M) const override; + + /// \brief It checks if a particular Function call can be offloaded. + /// \returns true if the Function is eligible for offloading. + bool isEligibleForOffloading(const CallInst *CI) { + return getOffloadingHandler(CI); + } + + /// \brief It checks if a particular Loop can be offloaded. + /// \returns true if the Loop is eligible for offloading. 
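+  ///
+  /// A minimal usage sketch (illustrative only; assumes a client pass that
+  /// has requested WorkloadAnalysis in its getAnalysisUsage and holds a
+  /// LoopInfo pointer LI):
+  /// \code
+  ///   WorkloadAnalysis &WA = getAnalysis<WorkloadAnalysis>();
+  ///   for (Loop *L : *LI)
+  ///     if (WA.isEligibleForOffloading(L)) {
+  ///       WorkloadInfo *WI = WA.getOffloadingHandler(L);
+  ///       // ... hand WI to the offloading transformation ...
+  ///     }
+  /// \endcode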
+ bool isEligibleForOffloading(const Loop *L) { + return getOffloadingHandler(L); + } + + /// \returns a WorkloadInfo handler if the Loop is + /// eligible for offloading, otherwise nullptr + WorkloadInfo *getOffloadingHandler(const CallInst *CI) { + auto WI=CallWorkloadMap.find( const_cast(CI) ); + return ( WI !=CallWorkloadMap.end() ) ? WI->second : nullptr; + } + + /// \returns a WorkloadInfo handler if the Loop is + /// eligible for offloading, otherwise nullptr + WorkloadInfo *getOffloadingHandler(const Loop *L) { + auto WI=LoopWorkloadMap.find( const_cast(L) ); + return ( WI !=LoopWorkloadMap.end() ) ? WI->second : nullptr; + } + + /// SetForOffloading notifies the analysis state that a particular + /// Workload is transformed for Offloading. If that Workload + /// is a function it also invalidates the eligibility of all its + /// nested loops. If the Workload is a Loop it invalidates the + /// eligibility of its parent function, its nested sub-loops and + /// all the loops that have a level value lower than the Loop. + bool setForOffloading(WorkloadInfo *WI); + + /// Transforming a Loop for offloading requires the extraction of the Loop + /// to a function before proceeding with the transformations. Calling this + /// member function updates the WorkloadInfo handler information after the + /// loop extraction. + bool MutateLoopToCallWorkload(WorkloadInfo *WI, Function *F, CallInst *CI); + + /// \returns all the Function Calls that are eligible for + /// offloading + CallWorkloadMapT & getCallWorkloads() { return CallWorkloadMap; } + + /// \returns all the Loops that are eligible for offloading + LoopWorkloadMapT & getLoopWorkloads() { return LoopWorkloadMap; } + + /// \returns a TargetLibraryInfo instance + TargetLibraryInfo *getTLI() { return TLI; } + +private: + // Documentation for the private member functions is available + // in the implementation file. 
+ + void analyzeCodeMemRef(Function &F, FunctionWorkloadT &FunctionWorkloadMap, + LoopWorkloadMapT &EligibleLoopWorkloadMap); + bool analyzeCodeMemRefEligibility(Function *F, + FunctionWorkloadT &FunctionWorkloadMap); + bool analyzeCodeMemRefEligibility(Loop *L, + LoopWorkloadMapT &EligibleLoopWorkloadMap); + + void analyzeMemAllocUseEligibility(Function *F, WorkloadInfo *WI, + FunctionCallersSet &Callers); + void analyzeMemAllocUseEligibility(Loop *L, WorkloadInfo *WI, + FunctionCallersSet &Callers); + + // Workload of Function and Loops that passed the first step of + // the Analysis (Code and Memory Reference Analysis); + FunctionWorkloadT EligibleFunctionWorkloadMap; + LoopWorkloadMapT EligibleLoopWorkloadMap; + + // Workload of Function Calls and Loops that passed the second + // step of the Analysis (Memory Allocation Use Analysis); + LoopWorkloadMapT LoopWorkloadMap; + CallWorkloadMapT CallWorkloadMap; + + // Support Data Structures + std::map> ELoops; + std::map> ECalls; + std::vector GarbageCollector; + + // Analyses used by the Workload Analysis + AliasAnalysis *AA; + LoopInfo *LI; + DominatorTree *DT; + CallGraph *CG; + TargetLibraryInfo *TLI; + std::vector LoopInfoCache; +}; + +} + +#endif //LLVM_TRANSFORMS_HEXE_WORKLOADANALYSIS_H Index: include/llvm/Transforms/Hexe/WorkloadExtractor.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/WorkloadExtractor.h @@ -0,0 +1,296 @@ +//===---------- Transforms/Hexe/WorkloadExtractor.h - Hexe -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// ===- High-Level Description of the Workload Extraction Utils and Pass - === +/// Key Goal: Design and implement utility classes and a compiler pass that +/// extract loops or functions for offloading. The Workload Analysis provides +/// Workload Information handlers which describe the eligible Loops and +/// Functions. +/// +/// Key Points: +/// a) Function/Loop Code Extraction. We need to extract the workload code +/// in a new Module that will be compiled for the target accelerators. +/// However we need to remain accelerator agnostic for portability reasons. +/// +/// b) Preserve the original version of the workload on the host module. An +/// accelerator may not be available for use during the application +/// execution for various reasons (occupied, disabled etc). We should be +/// able to fall back on CPU execution. +/// +/// c) Inject the neccessary runtime calls and control flow. A runtime +/// library controls the scheduling between the CPU and the accelerator, +/// coherency and dispatch on the accelerator. +/// +/// d) Replace the memory allocation method of the memory allocations that +/// are accessed by workloads that will be offloaded to the accelerator. +/// This is done by using memory management functions provided by the +/// runtime library. +/// We may have to replace: +/// 1) Heap Allocations (e.g. Malloc function calls), trivial +/// 2) Stack Allocations, (calloc), a bit tricky. We have to +/// replace a stack allocation with a library memory allocation, +/// meaning that we have to release that memory explicitly when +/// it is not required anymore. +/// 3) Global Variables, a bit tricky again. Global variables are +/// allocated in the data segment of the process address space. 
+/// In our scheme we need to replace their allocation method to +/// use memory provided by the Hexe runtime library calls. We do +/// that by emitting constructor and destructor functions that +/// use the library calls to allocate memory for the Global +/// Variables. These constructors and destructors are automatically +/// called at process initialization and exit. +/// +/// This file provides the declaration of three classes: +/// a) HexeWorkload encapsulates the new module we use to clone workloads +/// that are extracted for offloading. It provides operations for adding new +/// functions, inserting the required Hexe Metadata and enforcing module +/// validity. +/// +/// b) WorkloadExtractUtil is the class that takes care of the actual code +/// transformations. It exposes a simple interface. +/// +/// c) WorkloadExtractor is a Module pass that uses the WorkloadExtractUtil +/// functionality and transforms the code to support Workload Offloading. The +/// user can control its behavior via command line flags. It can be considered +/// as a basic concept pass that can be extended to serve a specific use case. +/// +/// Some Terminology: +/// Host: the main system platform, the processor that runs the Operating +/// System and the main application code. +/// +/// Accelerator: the co-processor where we offload workloads for +/// computation. +/// +/// Host Module: The original code module that gets transformed to support +/// workload offloading and gets compiled and run on the host platform. +/// +/// Hexe Module: The Module where we extract Workloads for which we enable +/// offloading. This module gets compiled to every accelerator target. It +/// is accelerator agnostic. Specializing for a specific accelerator target +/// is job of the Workload Transform Pass. +/// +/// FUTURE PLANS (TODO): +/// a) Make workload dispatch asynchronous. Design of the runtime library is +/// done. Work is required on the compiler analysis. +/// b) Speculation and Runtime Checks. Work is required on both compiler and +/// runtime library. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HEXE_WORKLOADEXTRACTOR_H +#define LLVM_TRANSFORMS_HEXE_WORKLOADEXTRACTOR_H + +#include +#include +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include + +namespace llvm { + +class WorkloadAnalysis; +class DataLayout; +class WorkloadInfo; +class DominatorTree; +class AllocaInst; +class TargetLibraryInfo; +class HexeWorkload; + +typedef std::tuple< Function *, unsigned, StructType *> HexeFunctionStructInfoT; +typedef std::map FunctionHexeFunctionMapT; + +/// HexeWorkload encapsulates the new module we use to save workloads +/// that are extracted for offloading. It provides operations for +/// adding new functions, inserting the required Hexe Metadata and enforcing +/// module validity. +class HexeWorkload { +public: + /// \brief The standard constructor + HexeWorkload(LLVMContext &Context, StringRef MName="Hexe_Workload"); + ~HexeWorkload() { delete M; delete DL; delete TargetTriple; } + + /// \brief sets the DataLayout and the Triple of the host platform. + void setDLandTriple(const DataLayout &DL, const Triple &TargetTriple); + + /// \brief inserts Hexe specific Metadata and writes the Module to a + /// file. 
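+  ///
+  /// A minimal sketch of the intended flow (illustrative, not the
+  /// authoritative pass sequence; HostM names the host Module, and
+  /// HexeWorkloadFName defaults to "hexe_workload.ll"):
+  /// \code
+  ///   HexeWorkload HW(HostM->getContext());
+  ///   HW.setDLandTriple(HostM->getDataLayout(),
+  ///                     Triple(HostM->getTargetTriple()));
+  ///   // ... extract workloads into HW via WorkloadExtractUtil ...
+  ///   HW.writeFile(HexeWorkloadFName);
+  /// \endcode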
+ void writeFile(StringRef Filename); + + /// \brief inserts Hexe specific Metadata to the Module + void writeModule(Module *M); + + /// \brief We support a limited number of function calls + void validateSupportedFunctionCalls(); + + /// \brief Adds a new Function to the Hexe Workload Module + /// + /// \returns a newly created Function that is part of the + /// Hexe Workload, a unique Hexe ID and a StructType generated + /// by the function interface. + HexeFunctionStructInfoT addFunctionPrototype(Function *F, FunctionType *FT); + + /// \brief It checks if there is Mapping for the Host Function F + /// + /// \returns the corresponding Function that is part of the + /// Hexe Workload, a unique Hexe ID and a StructType generated + /// by the function interface, if there is a mapping. + /// Otherwise nullptr. + HexeFunctionStructInfoT * getMapping(const Function *F) + { + auto I=Mapping.find( const_cast(F) ); + return ( I!=Mapping.end() ) ? &(I->second) : nullptr; + } + + /// \brief It checks if there is a mapping for the host + /// Function F in the Hexe Workload. + /// + /// \returns true if exists + bool hasMapping(const Function *F) { return getMapping(F); } + + /// \returns the LLVM Context + LLVMContext &getContext() { return C; } + + /// \returns the DataLayout + DataLayout *getDataLayout() { return DL; } + + /// \returns the Triple + Triple *getTriple() { return TargetTriple; } + +private: + LLVMContext &C; // LLVM Context + StringRef MSID; // Hexe Module String ID + DataLayout *DL; // DataLayout of the Host platform + Triple *TargetTriple; + Module *M; + unsigned nextFunctionID; + + // Host Function Mapping to the Hexe Workload Function, its Hexe ID + // and the StructType generated by the original function interface. + FunctionHexeFunctionMapT Mapping; +}; + + +/// This class performs all the necessary code transformations for +/// enabling the offloading of one or more workloads to an accelerator. +/// The Workload Analysis provides WorkloadInfo handlers that are used +/// by this class to perform the necessary transformations. +class WorkloadExtractUtil { +public: + /// \brief The Standard Constructor + /// \param WA: Workload Analysis + /// \param HM: Host Module, the original code module that will be + /// compiled for the host architecture. + /// \param Host Pass, the pass that utilizes this class object. + WorkloadExtractUtil(WorkloadAnalysis *WA, Module *HM, Pass *HP, + bool SupportGlobals=true) { + this->WA=WA; + this->HM=HM; + this->HP=HP; + setHexeRTFunctions(); + } + + /// \brief Provide the HexeWorkload object which will + /// store the extracted workload code. + void registerHexeWorkload(HexeWorkload *HW) { + this->HW=HW; + } + + /// \brief It performs the necessary code transformations + /// to enable the offloading of a Workload to the accelerator. + /// The WorkloadInfo handler should be given by the Workload + /// Analysis. A Workload can either be a Function Call or a + /// Loop. + bool extractWorkloadCode(WorkloadInfo *WI); + + /// It performs the necessary replacement of the Memory Allocations + /// that are used by Workloads that have been transformed for + /// offloading. Important: Call this only after having completed all + /// the required extractWorkloadCode calls. + bool replaceMemAllocations(); + +private: + // Documentation for the private member functions is available + // in the implementation file. 
+ + bool loopToFunction(WorkloadInfo *WI, WorkloadInfo *CW, DominatorTree *DT); + Function *cloneOrGetHexeFunction(WorkloadInfo *WI); + bool transformAndInjectGlueCode(WorkloadInfo *WI); + + Value *injectRuntimeSched(Instruction *II, + CallInst *CI, WorkloadInfo *WI); + Instruction *marshalAndOffload(Instruction *II, + CallInst *CI, WorkloadInfo *WI); + + void setHexeRTFunctions(); + void annotateMemAllocationsForReplacement(WorkloadInfo *WI); + + + WorkloadAnalysis *WA; //Hexe Workload Analysis + Module *HM; //Host Module + Pass *HP; //Host Pass + HexeWorkload *HW; //Hexe Workload Module + + // Hexe Runtime Function Declarations + Function *HexeDispatchCall; + Function *HexeCoherencyCall; + Function *HexeRuntimeSchedCall; + Function *HexeEventWaitCall; + Function *HexeMalloc; + Function *HexeFree; + + // Hexe Datatypes + StructType *HexeEventT; + StructType *MemoryAccessInfoT; + StructType *HexeKernelInfoT; + + // Memory Allocations used by workloads + // that need to be transformed to use + // Hexe memory allocation facilities. + SetVector MallocAllocations; + SetVector GlobalAllocations; + SetVector AllocaAllocations; +}; + +/// Available Access Modes for Memory Accesses +/// performed by the Workloads. +enum class MemAccessInfoAccessMode { + Read, + Write, + ReadWrite +}; + +/// WorkloadExtractor is a Module pass that uses the WorkloadExtractUtil +/// functionality and transforms the code to support Workload Offloading. The +/// user can control its behavior via command line flags. It can be considered +/// as a basic concept pass that can be extended to serve a specific use case. +class WorkloadExtractor : public ModulePass { +public: + static char ID; + WorkloadExtractor(); + ~WorkloadExtractor() {}; + + /// \brief It requests analyses etc + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// \brief It performs transformations that enable workload offloading. + /// The transformations can be controlled via the HexeFunctionCalls, + /// HexeLoops and HexePolicy flags. + bool runOnModule(Module &M) override; + + /// \brief It releases the memory used by the Extractor Pass. + void releaseMemory() override; + +private: + WorkloadAnalysis *WA; //Hexe Workload Analysis + HexeWorkload *HW; //Hexe Workload Module + WorkloadExtractUtil *WEU; //Workload Extraction Utility +}; + +} + +#endif Index: include/llvm/Transforms/Hexe/WorkloadTransform.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/WorkloadTransform.h @@ -0,0 +1,68 @@ +//===-------- Transforms/Hexe/WorkloadTransform.h - Hexe ----- -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// ===------- High-Level Description of the Workload Transform Pass -------=== +/// Key Goal: Design and implement a compiler pass that converts the Hexe +/// Workload we extract from host code modules to comply with the conventions +/// and limitations of particular accelerator environment. +/// +/// Depending the accelerator and its programming environment, we may have +/// to work on: +/// Function Interfaces +/// Enviroment Function Call Injection +/// ABI conversions +/// Big-Little Endianess +/// 32 vs 64 bit architectures. +/// +/// A modular design has been adopted. A virtual Adaptor class has been +/// designed. 
For every new convention that needs to be supported the +/// developer has to sub-class that interface, The interface is available +/// at: HeterogeneousAdaptors/Adaptors.h +/// +/// Future Plans (TODO): +/// a) Write an Adaptor for OpenCL/SPIR +/// b) Investigate more accelerator types and build a codebase of utility +/// classes that can be used across different acceleratos and conventions +/// This pass in early stage. +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HEXE_WORKLOADTRANSFORM_H +#define LLVM_TRANSFORMS_HEXE_WORKLOADTRANSFORM_H + +#include "llvm/Pass.h" +#include +#include + +namespace llvm { + +typedef std::tuple HexeFunctionInfoT; +typedef std::vector HexeFunctionInfoListT; + +//Pass Code +class WorkloadTransform : public ModulePass { +public: + WorkloadTransform(); + ~WorkloadTransform(){}; + static char ID; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// \brief Initialize the right Adaptor and performs the required + /// code transformations + bool runOnModule(Module &M) override; + + //\brief It release the memory used by the pass + void releaseMemory() override; +private: + HexeFunctionInfoListT FL; + unsigned functionNum; +}; + +} + +#endif //LLVM_TRANSFORMS_HEXE_WORKLOADTRANSFORM_H Index: lib/Transforms/CMakeLists.txt =================================================================== --- lib/Transforms/CMakeLists.txt +++ lib/Transforms/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(Vectorize) add_subdirectory(Hello) add_subdirectory(ObjCARC) +add_subdirectory(Hexe) Index: lib/Transforms/Hexe/CMakeLists.txt =================================================================== --- /dev/null +++ lib/Transforms/Hexe/CMakeLists.txt @@ -0,0 +1,10 @@ +add_llvm_library(LLVMHexe + Hexe.cpp + WorkloadAnalysis.cpp + WorkloadExtractor.cpp + WorkloadTransform.cpp + Utils.cpp + HeterogeneousAdaptors/Hexagon.cpp + ) + +add_dependencies(LLVMHexe LLVMIRReader intrinsics_gen) Index: lib/Transforms/Hexe/HeterogeneousAdaptors/Hexagon.cpp =================================================================== --- /dev/null +++ lib/Transforms/Hexe/HeterogeneousAdaptors/Hexagon.cpp @@ -0,0 +1,384 @@ +// === --- Hexagon.cpp - Heterogeneous Execution Engine ------*- C++ -*-=== // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// === -----------------------------------------------------------------=== // +/// \file +/// This file contains the definition of the Hexagon Adaptor of the +/// Heterogeneous Execution Engine. This Adaptor supports convention +/// transformations for the Qualcomm Hexagon DSP. +/// +/// It supports two modes: +/// a) Host Architecture: arm 32 and Accelerator Architecture: hexagon. This +/// cooperates with the hexagon plugin of the Hexe runtime library. +// +/// b) The same Host and Accelerator architecture. This cooperates with the +/// accelerator debug plugin of the Hexe runtime library and servers debugging +/// purposes. It can be used to evaluate and test Hexe compiler and runtime +/// functionality without the need of an accelerator at all. 
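/// For example (illustrative triples): mode (a) corresponds to a host such
/// as armv7-none-linux-gnueabi paired with a hexagon accelerator triple,
/// while mode (b) uses identical host and accelerator triples; see
/// isSupported() below for the exact checks.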
+// === -----------------------------------------------------------------=== // + + + +#include "llvm/Transforms/Hexe/HeterogeneousAdaptors/Adaptors.h" +#include "llvm/Transforms/Hexe/Utils.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +// Hexagon Adaptor Conventions. +// +// The host runtime/driver calls the following function: +// +// int __hexe_skel_invoke(uint32 handler, remote_buf* bufs); +// +// a) The first argument is an integer handler which describes +// which computation function needs to be called and how +// to parse the second argument. +// +// b)The second argument is an array of struct elements of the following +// type: +// +// struct remote_buf{ +// void *pointer; +// size_t size; +// }; +// +// This struct represents a memory segment that is shared between the +// host and the accelerator environment where the runtime and driver +// environment translates the pointer address to a valid one. +// +// We reserve the first element of the array for storing scalar +// arguments and the return value. The rest of the elements represent +// memory segments that are accessed by the computational function. +// +// According to our convention each computational function must have the +// following interface: +// +// int comp_function_interface(remote_buf* bufs); +// +// It takes a single argument, the array of shared segments. For each +// computational function we need to generate code that does the +// appropriate marshalling. +// +// The following macro is used to choose which computational function +// should be called from the __hexe_skel_invoke. +// +// #define CHOOSE_COMP_FUNCTION(hanlder) (((handler) >> 24) & 0x1f) +// +// Example: +// +// int hexe_skel_invoke(uint32 handler, remote_buf* bufs) +// { +// int functionID = CHOOSE_COMP_FUNCTION(handler); +// +// switch (functionID) { +// case 0: +// return compFunction0(bufs); +// case 1: +// return compFunction1(bufs); +// default: +// return 20; //error +// } +// } +// +// All the functions should return 0 on success and 20 in case of error. + + +/// Adaptor for the Hexagon convention +class HexagonWorkloadAdaptor : public HexeWorkloadAdaptor { + public: + HexagonWorkloadAdaptor() { } + ~HexagonWorkloadAdaptor() { } + + void transform(Module *M, const Triple &HTriple, const Triple &ATriple, + const HexeFunctionInfoListT &FL) override; + + bool isSupported(const Triple &HTriple, const Triple &ATriple) override; + + private: + /// \brief It transforms the computational functions of the Module to + /// have a compatible interface. It also generates the marshalling code + /// for reading the scalar and pointer arguments from the input + /// buffers. + void transformFunctions(Module *M, + const HexeFunctionInfoListT &IFL, + HexeFunctionInfoListT &OFL); + + /// \brief It transforms a computational function of the Module to + /// have a compatible interface. It also generates the marshalling code + /// for reading the scalar and pointer arguments from the input + /// buffers. + Function *transformFunction(Function *F); + + /// \brief It generates the skeleton function that the host runtime/ + /// driver calls. This function then calls the requrest computational + /// function. 
+ Function *generateSkelInvoke(Module *M, const HexeFunctionInfoListT &FL); + void DataLayoutTripleUpdate(Module *M, Triple HTriple, Triple ATriple); + Type *BufferST; + Type *BufferSTP; +}; + + + +void HexagonWorkloadAdaptor::transform(Module *M, const Triple &HTriple, + const Triple &ATriple, const HexeFunctionInfoListT &FL) +{ + LLVMContext &C = M->getContext(); + Type *VoidPT = Type::getInt8PtrTy(C); + Type *Int32 = Type::getInt32Ty(C); + + //Buffer Struct Type + Type *ET[] = { VoidPT, Int32 }; + BufferST = StructType::create( ArrayRef(ET, 2) ); + BufferSTP = PointerType::getUnqual(BufferST); + + //It transforms the functions + HexeFunctionInfoListT OFL; + transformFunctions(M, FL, OFL); + + //It generates the Skel function + generateSkelInvoke(M, OFL); + + //updates the DataLayout and The Triple of the module + DataLayoutTripleUpdate(M, HTriple, ATriple); +} + +void HexagonWorkloadAdaptor::transformFunctions(Module *M, + const HexeFunctionInfoListT &IFL, + HexeFunctionInfoListT &OFL) +{ + for(auto I = IFL.begin(), IE = IFL.end(); I!= IE; ++I) { + Function *F; + unsigned FID; + std::tie(F, FID) = *I; + OFL.push_back(std::make_tuple(transformFunction(F), FID)); + } +} + + +Function *HexagonWorkloadAdaptor::transformFunction(Function *F) +{ + Module *M = F->getParent(); + LLVMContext &C = M->getContext(); + Type *Int32 = Type::getInt32Ty(C); + ConstantInt *Zero = ConstantInt::get(Type::getInt32Ty(C), 0); + FunctionType *OFT = F->getFunctionType(); + Type *OFST = getCompactFunctionStruct(OFT, C); + Type *OFSTP = PointerType::getUnqual(OFST); + + //function_interface + FunctionType *UserFT = FunctionType::get(Int32, + ArrayRef(BufferSTP), false); + Function *NF = Function::Create(UserFT, GlobalValue::InternalLinkage, + F->getName()+"_compact", M); + Value *APointer = NF->arg_begin(); + + + //Create the marshalling basic block where the marshaslling + //operations take place + BasicBlock *BB = BasicBlock::Create(C, "marshallingblock", NF); + + //get the address of a particular buffer + auto getBufferAddr = [ &APointer, &C, &BB, &Zero ] + (unsigned Indx) { + Value *Indices[] = { + ConstantInt::get(Type::getInt32Ty(C), Indx), + Zero + }; + auto P = + GetElementPtrInst::CreateInBounds(APointer, + ArrayRef(Indices, 2), "", BB); + return new LoadInst(P, "", BB); + }; + + Instruction *LA = getBufferAddr(0); + Instruction *ArgStructP = CastInst::CreatePointerCast( + LA, OFSTP, "", BB); + + //get scalar Call Arg (The first buffer is reserved to contain the scalar + //arguments) + auto getCallArg = [ &ArgStructP, &C, &BB, &Zero ] + (unsigned Indx) { + Value *Indices[] = { Zero, + ConstantInt::get(Type::getInt32Ty(C), Indx) + }; + auto P = + GetElementPtrInst::CreateInBounds(ArgStructP, + ArrayRef(Indices, 2), "", BB); + return new LoadInst(P, "", BB); + }; + + //read scalar or pointer arguments + unsigned BIndex = 1; + SmallVector Args; + for (unsigned I = 0, IE = OFT->getNumParams(); IgetParamType(I); + if (PT->isPtrOrPtrVectorTy()) { + Instruction *BA = getBufferAddr(BIndex++); + Instruction *CBA = CastInst::CreatePointerCast(BA, PT, "", BB); + Args.push_back(CBA); + } else { + Args.push_back( getCallArg(I) ); + } + } + + //update return valuea lambda + auto updateReturnValue = [ &ArgStructP, &C, &BB, &Zero ] + ( unsigned Indx, Value *V) { + Value *Indices[] = { + Zero, + ConstantInt::get(Type::getInt32Ty(C), Indx) + }; + auto P = GetElementPtrInst::CreateInBounds(ArgStructP, + ArrayRef(Indices, 2), "", BB); + return new StoreInst(V, P, BB); + }; + + + //update the return value + CallInst *CI = 
CallInst::Create(F, Args, "", BB); + if (!OFT->getReturnType()->isVoidTy()) + updateReturnValue(OFT->getNumParams(), CI); + + ReturnInst::Create(C, Zero, BB); + + //inline the original function in the newly generated + //function + InlineFunctionInfo IFI; + bool rv = InlineFunction(CI, IFI); + assert(rv); + + F->eraseFromParent(); + return NF; +} + +/// \brief creates the instructions that compute the computational +/// function index. This index is used to call the requested function. +/// +/// #define CHOOSE_COMP_FUNCTION(hanlder) (((handler) >> 24) & 0x1f) +static Value *injectMethodIndexCompute(LLVMContext &C, Value *dwScalars, + BasicBlock *IB) +{ + ConstantInt *CShifts = ConstantInt::get(Type::getInt32Ty(C), 24); + ConstantInt *CMask = ConstantInt::get(Type::getInt32Ty(C), 0x1f); + + Value *V = + BinaryOperator::Create(Instruction::Shl, dwScalars, CShifts, "", IB); + return BinaryOperator::Create(Instruction::And, V, CMask, "", IB); +} + + +/// \brief it generates a switch case mapping a function index to a +/// function call. +static void injectSwitchCaseCode(LLVMContext &C, SwitchInst *SWI, + Value *BufferPArg, unsigned FIndex, + Function *CF, Function *HF) +{ + ConstantInt *FIndexVal = ConstantInt::get(Type::getInt32Ty(C), FIndex); + BasicBlock *BB = BasicBlock::Create(C, "", HF); + CallInst *CI = CallInst::Create(CF, BufferPArg, "", BB); + + ReturnInst::Create(C, CI, BB); + SWI->addCase(FIndexVal, BB); +} + +Function *HexagonWorkloadAdaptor::generateSkelInvoke(Module *M, + const HexeFunctionInfoListT &FL) +{ + LLVMContext &C = M->getContext(); + Type *Int32 = Type::getInt32Ty(C); + + //error value + Constant *CError = ConstantInt::get(Int32, 20); + + //hexe_skel_invoke_type + Type *IT[] = { Int32, BufferSTP }; + FunctionType *SkelInvokeFT = + FunctionType::get(Int32, ArrayRef(IT, 2), false); + + //create the SkelInvoke function and add a switch + Function *SkelF = + Function::Create(SkelInvokeFT, GlobalValue::ExternalLinkage, + "__hexe_skel_invoke", M); + + auto I = SkelF->arg_begin(); + Value *dwScalarsV = I; + ++I; + Value *BufferPV = I; + + BasicBlock *BB = BasicBlock::Create(C, "", SkelF); + Value *MIndexV = injectMethodIndexCompute(C, dwScalarsV, BB); + + BasicBlock *SwitchDefault = BasicBlock::Create(C, "", SkelF); + ReturnInst::Create(C, CError, SwitchDefault); + SwitchInst *SWI = SwitchInst::Create(MIndexV, SwitchDefault, FL.size(), BB); + + for (auto I = FL.begin(), IE = FL.end(); I!= IE; ++I) { + Function *F; + unsigned Index; + std::tie(F, Index) = *I; + injectSwitchCaseCode(C, SWI, BufferPV, Index, F, SkelF); + } + + + return SkelF; +} + +bool HexagonWorkloadAdaptor::isSupported(const Triple &HTriple, + const Triple &ATriple) +{ + //host and accelerator same triple (used for debug) + if ((HTriple.getArch() == ATriple.getArch()) && + (HTriple.getOS() == ATriple.getOS()) && + (HTriple.getEnvironment() == ATriple.getEnvironment())) + return true; + + //arm 32bits, linux standard gnu abi + if ((HTriple.getArch() == Triple::arm) && + (HTriple.getOS() == Triple::Linux) && + (HTriple.getEnvironment() == Triple::GNUEABI)) + return true; + + return false; +} + +void HexagonWorkloadAdaptor::DataLayoutTripleUpdate(Module *M, Triple HTriple, + Triple ATriple) +{ + if ((HTriple.getArch() == ATriple.getArch()) && + (HTriple.getOS() == ATriple.getOS()) && + (HTriple.getEnvironment() == ATriple.getEnvironment())) { + M->setTargetTriple(HTriple.str()); + return; + } + + if ((HTriple.getArch() == Triple::arm) && + (HTriple.getOS() == Triple::Linux) && + (HTriple.getEnvironment() == 
Triple::GNUEABI) && + (ATriple.getArch() == Triple::hexagon)) { + M->setTargetTriple(ATriple.str()); + DataLayout D("e-m:e-p:32:32-i64:64-a:0-v32:32-n16:32"); + M->setDataLayout(D.getStringRepresentation()); + } +} + +namespace llvm{ + +HexeWorkloadAdaptor *createHexagonWorkloadAdaptor() +{ + return new HexagonWorkloadAdaptor(); +} + +} Index: lib/Transforms/Hexe/Hexe.cpp =================================================================== --- /dev/null +++ lib/Transforms/Hexe/Hexe.cpp @@ -0,0 +1,57 @@ +//===-------- Hexe.cpp - Heterogeneous Execution Engine --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines the initialization function of the Heterogeneous +/// Execution Engine and the command line flags that control the engine +/// operations. +//===----------------------------------------------------------------------===// + + +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" + + +/// initializeHexe - Initialize all passes linked into Hexe library. +void llvm::initializeHexe(llvm::PassRegistry &Registry) +{ + llvm::initializeWorkloadAnalysisPass(Registry); + llvm::initializeWorkloadExtractorPass(Registry); + llvm::initializeWorkloadTransformPass(Registry); +} + +namespace llvm { +cl::opt HexeLoops( + "hexe-loops", + cl::desc("Activates all the eligible loops for offloading (Hexe)"), + cl::init(false)); + +cl::opt HexeFunctionCalls( + "hexe-functioncalls", + cl::desc("Activates all the eligible function calls for offloading" + " (Hexe)"), + cl::init(false)); + +cl::opt HexePolicy( + "hexe-policy", cl::desc("Defines Hexe Offloading Policy"), + cl::value_desc("hexe policy name"), cl::init("") ); + +cl::opt HexeWorkloadFName( + "hexe-workload-fname", cl::desc("Define Hexe Workload filename"), + cl::value_desc("hexe workload filename"), + cl::init("hexe_workload.ll") ); + +cl::opt HexeAdaptor( + "hexe-adaptor", cl::desc("Defines Hexe Accelerator Adaptor"), + cl::value_desc("hexe policy name"), cl::init("") ); + +cl::opt HexeAdaptorCheck( + "hexe-adaptor-check", + cl::desc("Checks the compatibility of the Adaptor (Hexe)"), + cl::init(true)); +} Index: lib/Transforms/Hexe/Utils.cpp =================================================================== --- /dev/null +++ lib/Transforms/Hexe/Utils.cpp @@ -0,0 +1,145 @@ +// ===----- Utils.cpp - Heterogeneous Execution Engine --------*- C++ -*-=== // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===-------------------------------------------------------------------=== // +/// \file +/// This file defines utility functions that: +/// a)create Struct Types based on function interfaces. +/// b)Hexe Metadata read and write functions. +/// c)erase Hexe Metadata from a Module +// ===-------------------------------------------------------------------=== // + + +#include "llvm/Transforms/Hexe/Utils.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" + + +//the operation of the functions is documented in the header file. 
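+
+//For reference, the Hexe named metadata produced by writeHexeMetadata has
+//roughly the following textual-IR shape (an illustrative sketch for two
+//functions with IDs 0 and 1; function names and triple are made up):
+//
+//  !hexe.info          = !{!0}    ; !0 = !{i32 2}  (number of functions)
+//  !hexe.host_triple   = !{!1}    ; !1 = !{!"armv7-none-linux-gnueabi"}
+//  !hexe.function_list = !{!2, !3}
+//  !2 = !{void (float*, i32)* @kernel0, i32 0}
+//  !3 = !{void (i32*)* @kernel1, i32 1}
+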
+ +namespace llvm{ + +StructType *getCompactFunctionStruct(FunctionType *FT, LLVMContext &C) +{ + SmallVector StructElemTypes(FT->params().begin(), + FT->params().end()); + + Type *RT = FT->getReturnType(); + if (!RT->isVoidTy()) + StructElemTypes.push_back(RT); + + return StructType::create(StructElemTypes); +} + +/// \brief writeFunctionInfoMD writes information about a specific function +/// manipulated by Hexe as metadata. +static void writeFunctionInfoMD(LLVMContext &C, + const HexeFunctionStructInfoT &FI, NamedMDNode *NN) +{ + Metadata *FM = ConstantAsMetadata::getConstant(std::get<0>(FI)); + Metadata *Findex = + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), + std::get<1>(FI))); + + Metadata *MDA[] = { FM, Findex }; + MDNode *N = MDNode::get(C, ArrayRef(MDA, 2) ); + NN->addOperand(N); +} + + +void readHexeMetadata(Module *M, unsigned &NumHexeFunctions, + HexeFunctionInfoListT &FL, Triple &HTriple) +{ + NamedMDNode *HI = M->getNamedMetadata("hexe.info"); + + //read Number of Hexe Functions + assert(HI); + ConstantAsMetadata *FN = + dyn_cast(HI->getOperand(0)->getOperand(0)); + assert(FN); + ConstantInt *CI = dyn_cast(FN->getValue()); + assert(CI); + NumHexeFunctions = CI->getZExtValue(); + + //read host platform triple + NamedMDNode *HHT = M->getNamedMetadata("hexe.host_triple"); + assert(HHT); + MDString *HTS = dyn_cast(HHT->getOperand(0)->getOperand(0)); + assert(HTS); + HTriple.setTriple(HTS->getString()); + + NamedMDNode *HFL = M->getNamedMetadata("hexe.function_list"); + assert(HFL); + assert(NumHexeFunctions == HFL->getNumOperands()); + + //read Function List Metadata + for (unsigned I = 0, IE = HFL->getNumOperands(); IgetOperand(I); + ConstantAsMetadata *FM = dyn_cast(MD->getOperand(0)); + Function *F = dyn_cast( FM->getValue() ); + + ConstantAsMetadata *FIDM = dyn_cast(MD->getOperand(1)); + ConstantInt *FID = dyn_cast(FIDM->getValue()); + FL.push_back(std::make_tuple(F, FID->getZExtValue())); + } + +} + +void writeHexeMetadata(Module *M, unsigned NumHexeFunctions, + const FunctionHexeFunctionMapT &Mapping, const Triple &HTriple) +{ + LLVMContext &C = M->getContext(); + + NamedMDNode *HI = M->getOrInsertNamedMetadata("hexe.info"); + assert(HI); + + //write Number of Hexe Functions + HI->dropAllReferences(); + Metadata *NumOfFunctions = + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(C), NumHexeFunctions)); + + HI->addOperand(MDNode::get(C, NumOfFunctions)); + + //write host platform triple + NamedMDNode *HHT = M->getOrInsertNamedMetadata("hexe.host_triple"); + assert(HHT); + + HHT->dropAllReferences(); + Metadata *HTripleS = MDString::get(C, HTriple.str()); + HHT->addOperand(MDNode::get(C, HTripleS)); + + //write function metadata + NamedMDNode *HFL = M->getOrInsertNamedMetadata("hexe.function_list"); + assert(HFL); + + std::vector< std::tuple< Function *, unsigned, StructType *> > FIV; + FIV.resize(Mapping.size()); + + for (auto I = Mapping.begin(), IE = Mapping.end(); I!= IE; ++I){ + size_t index = std::get<1>(I->second); + FIV[index] = I->second; + } + + for (auto I = FIV.begin(), IE = FIV.end(); I!= IE; ++I) + writeFunctionInfoMD(C, *I, HFL); +} + +void eraseHexeMetadata(Module *M) +{ + //gets and erase Metadata + auto removeMD = [ &M ]( StringRef MDN) { + M->eraseNamedMetadata(M->getNamedMetadata(MDN)); + }; + + removeMD("hexe.info"); + removeMD("hexe.host_triple"); + removeMD("hexe.function_list"); +} + +} Index: lib/Transforms/Hexe/WorkloadAnalysis.cpp =================================================================== --- /dev/null +++ 
lib/Transforms/Hexe/WorkloadAnalysis.cpp @@ -0,0 +1,968 @@ +// ===-- WorkloadAnalysis.cpp - Heterogeneous Execution Engine --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// === -------------------------------------------------------------------===// +/// \file +/// Implementation of the Workload Analysis Pass of the Heterogeneous +/// Executon Engine. Please read the header file documentation for high +/// level description. +// === -------------------------------------------------------------------=== // + + +#include "llvm/Transforms/Hexe/WorkloadAnalysis.h" +#include "llvm/Transforms/Hexe/InitializeHexePasses.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" +#include +#include +#include +#include + +#define DEBUG_TYPE "hwa" + +using namespace llvm; + +/// \brief isEligibleArray checks if we support a particular +/// Array Type. +/// +/// \returns true if the Array Type is supported. +static bool isEligibleArray(const Type *T); + +/// \brief isScalarType checks if T is a scalar type. +/// +/// \returns true on success. +static bool isScalarType(const Type *T) +{ + return T->isFloatTy() || T->isDoubleTy() || T->isIntegerTy(); +} + +/// \brief isEligibleStruct checks if we support a particular +/// Struct Type. +/// +/// \returns true if the Struct Type is supported. +static bool isEligibleStruct(const Type *T) +{ + if (!T->isStructTy()) + return false; + + const StructType *ST = dyn_cast(T); + for (auto I = ST->element_begin(), IE = ST->element_end(); I!= IE; ++I) + if ( !isScalarType(*I) && !isEligibleArray(*I) && !isEligibleStruct(*I) ) + return false; + + return true; +} + +/// \brief isEligibleArray checks if we support a particular +/// Array Type. +/// +/// \returns true if the Array Type is supported. +static bool isEligibleArray(const Type *T) +{ + if (!T->isArrayTy()) + return false; + const Type *AT = dyn_cast(T); + const Type *ET = AT->getArrayElementType(); + return isScalarType(ET) || isEligibleStruct(ET); +} + + +/// \brief inspectGlobalVariable checks if we support a particular +/// Global Variable Type. +/// +/// We support any type that is constructed from scalars, arrays +/// and structs. We do not support any pointer type. +/// +/// \returns true if the Global Variable Type is supported. +static bool inspectGlobalVariable(const Value *V) +{ + if (auto GV = dyn_cast(V)) { + Type *T = GV->getType()->getPointerElementType(); + + //We only support instruction, operator users + for (auto I = GV->user_begin(), IE = GV->user_end(); I!= IE; ++I) + { + if (!isa(*I) && !isa(*I)) + return false; + } + + return isScalarType(T) || isEligibleStruct(T) || isEligibleArray(T); + } + return false; +} + +/// \brief inspectInstruction checks if we support a particular instruction +/// for offloading. +/// +/// We do not support Atomic Operations, VAArg operations, function calls +/// and exceptions. However, we do support MemIntrinsics. +/// +/// \returns true if the Instruction is supported. 
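+///
+/// For example (illustrative): plain loads, stores and arithmetic are
+/// accepted, and so is a call to llvm.memcpy or llvm.memset, whereas
+/// atomics, va_arg, exception handling (invoke) and calls to arbitrary
+/// functions make the instruction unsupported.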
+static bool inspectInstruction(const Instruction &I) +{ + //We do support MemIntrinsic Calls, which they are generated + //either by the user or llvm passes. + if ( isa(I) ) + return true; + + if ( isa(I) || isa(I) || + isa(I) || isa(I) || + isa(I) || isa(I) ) + return false; + + //Otherwise the instruction is supported. + //alloca is supported. We can safely assume all the + //architectures we offload support stack allocations. + return true; +} + +/// \brief inspectBasicBlock checks if we support a particular Basic +/// Block for offloading. +/// +/// We do not support Basic Blocks that contain Atomic Operations, +/// VAArg operations, function calls and exceptions. However, we +/// do support MemIntrinsics. +/// +/// \returns true if the Basic Block is supported. +static bool inspectBasicBlock(const BasicBlock &BB) +{ + return std::all_of(BB.begin(), BB.end(), inspectInstruction); +} + +/// \brief analyzeMemRefValue analyzes a Memory reference value +/// that is retrieved by the Alias Analysis and strips pointer +/// casts, all zero geps and In Bounds Offsets. +/// +/// \returns the Value after stripping. +static Value *analyzeMemRefValue(Value *V) +{ + Value *OV = nullptr; + while (OV!= V) { + OV = V; + //if ( auto *BC = dyn_cast(P) ) + V = V->stripPointerCasts(); + V = V->stripInBoundsConstantOffsets(); + V = V->stripInBoundsOffsets(); + } + + return V; +} + +/// \brief resolveIfPHINode retrieves a Memory Reference +/// Value out of Loop constructs if the input param V +/// is a PHINode. +/// +/// \returns the Memory Reference Value if V is a PhiNode, +/// otherwise the original Value V. +static Value *resolveIfPHINode(Value *V) +{ + if (!isa(V)) + return V; + + PHINode *P = dyn_cast(V); + if (P->getNumIncomingValues()!= 2) + return P; + + Value *IV = analyzeMemRefValue(P->getIncomingValue(0)); + if (IV!= P) { + GetElementPtrInst *GEP = dyn_cast(IV); + if (!GEP) + return P; + if (GEP->getPointerOperand()!= P) + return P; + } + + return resolveIfPHINode( analyzeMemRefValue( P->getIncomingValue(1) ) ); +} + +/// \brief inspectMemAccess investigates if we support a particular memory +/// load or store instruction. +/// +/// If the memory operation is supported, we track the memory segment that +/// the instruction accesses. We track memory segments passed by the +/// function interface (InterfaceMemRefs) and global variables (GlobalMemRefs). +/// +/// \returns true if the memory access is supported. +static bool inspectMemAccess(const Instruction &I, AliasAnalysis *AA, + Function *F, WorkloadMemRefSet &InterfaceMemRefs, + WorkloadMemRefSet &GlobalMemRefs ) +{ + const LoadInst *LI = dyn_cast(&I); + const StoreInst *SI = dyn_cast(&I); + + //If the instruction is not a load or store we do not bother + //analyze further. We return true. + if ( !LI && !SI ) + return true; + + WorkloadMemRef Mref; + + //We use the Alias Analysis to reason about the memory segment. + if (LI) + Mref = MemoryLocation::get(LI); + else + Mref = MemoryLocation::get(SI); + + Value *P = const_cast(Mref.Ptr); + P = analyzeMemRefValue(P); + P = resolveIfPHINode(P); + Mref.Ptr = P; + + //Check if the memory segment is passed by the function interface. 
+ if (auto A = dyn_cast(Mref.Ptr)) { + if (A->getParent() == F){ + InterfaceMemRefs.insert(Mref); + return true; + } + } + + //We check if the memory segment is a global variable + if (inspectGlobalVariable(Mref.Ptr)) { + GlobalMemRefs.insert(Mref); + return true; + } + + //We check if the memory segment is a stack allocation + if (isa(Mref.Ptr)) + return true; + + //Unable to detect the origin of the memory segment + DEBUG(dbgs()<<"Analysis fails (Mem Reference): "<<*(Mref.Ptr)<<"\n" ); + return false; +} + +/// \brief inspectBasicBlockMemAccess investigates if we support +/// the memory accesses of a Basic Block. +/// +/// If the memory operation are supported, we track the memory +/// segments that the operations access. We track memory +/// segments passed by the function interface (InterfaceMemRefs) +/// and global variables (GlobalMemRefs). +/// +/// \returns true if the memory access is supported. +static bool inspectBasicBlockMemAccess(const BasicBlock &BB, + AliasAnalysis *AA, + Function *F, + WorkloadMemRefSet &InterfaceMemRefs, + WorkloadMemRefSet &GlobalMemRefs) +{ + return std::all_of(BB.begin(), BB.end(), + std::bind(inspectMemAccess, + std::placeholders::_1, AA, + F, std::ref(InterfaceMemRefs), + std::ref(GlobalMemRefs))); +} + +/// \brief It performs Code and Memory Reference Analysis +/// on Function F. If the Function is eligible, a Workload +/// Information (Function Type) entry is inserted in the +/// EligibleFunctionWorkloadMap. +/// +/// We analyze every instruction and memory access. If +/// the Function is eligible we keep track of it in +/// EligibleFunctionWorkloadMap. +/// +/// \returns true if the Function F is eligible. +bool WorkloadAnalysis::analyzeCodeMemRefEligibility(Function *F, + FunctionWorkloadT &EligibleFunctionWorkloadMap) +{ + WorkloadInfo WI; + if (!std::all_of(F->begin(), F->end(), inspectBasicBlock)) + return false; + + DEBUG(dbgs()<<"Code Analysis: Function: "<getName()<< + " completed successully\n"); + + if (!std::all_of(F->begin(), F->end(), + std::bind(inspectBasicBlockMemAccess, + std::placeholders::_1, AA, F, + std::ref(WI.InterfaceMemRefs), + std::ref(WI.GlobalMemRefs)))) + return false; + + DEBUG(dbgs()<<"Mem Ref Analysis: Function: "<getName()<< + " completed successully\n"); + + WI.F = F; + + if (WI.GlobalMemRefs.size()) + WI.type = WorkloadType::FunctionWithGlobals; + else + WI.type = WorkloadType::FunctionNoGlobals; + WorkloadInfo *DWI = new WorkloadInfo( std::move(WI) ); + EligibleFunctionWorkloadMap.insert( std::make_pair(F, DWI) ); + return true; +} + +/// \brief It performs Code and Memory Reference Analysis +/// on Loop L. If the Loop is eligible, a Workload +/// Information (Loop Type) entry is inserted in the +/// EligibleLoopWorkloadMap. +/// +/// We analyze every instruction and memory access. If +/// the Loop is eligible we keep track of it in +/// EligibleLoopWorkloadMap. +/// +/// \returns true if the Loop L is eligible. 
+bool WorkloadAnalysis::analyzeCodeMemRefEligibility(Loop *L, + LoopWorkloadMapT &EligibleLoopWorkloadMap) +{ + WorkloadInfo WI; + if (!std::all_of(L->block_begin(), L->block_end(), + [](const BasicBlock *BB) { return inspectBasicBlock(*BB); } )) + return false; + + Function *F = L->getHeader()->getParent(); + DEBUG(dbgs()<<"Code Analysis: Function "<getName()<<" Loop: "<< + L->getHeader()->getName()<<" completed successully\n"); + + auto inspect = [&](const BasicBlock *BB) { + return inspectBasicBlockMemAccess(*BB, AA, F, + std::ref(WI.InterfaceMemRefs), + std::ref(WI.GlobalMemRefs)); + }; + + if (!std::all_of(L->block_begin(), L->block_end(), inspect )) + return false; + + DEBUG(dbgs()<<"Mem Ref Analysis: Function: "<getName()<<" Loop: " + <getHeader()->getName()<<" completed successully\n"); + + WI.L = L; + WI.F = F; + + if (WI.GlobalMemRefs.size()) + WI.type = WorkloadType::LoopWithGlobals; + else + WI.type = WorkloadType::LoopNoGlobals; + + //We use CodeExtractor to check if we can extract + //a Loop, we do not perform any code modification + //this is an analysis pass! + CodeExtractor CE(*DT, *L); + if (!CE.isEligible()) + return false; + + WorkloadInfo *DWI = new WorkloadInfo( std::move(WI) ); + EligibleLoopWorkloadMap.insert( std::make_pair(L, DWI) ); + + return true; +} + + +/// \brief It performs Code And Memory Reference Analysis on a +/// Function and its nested Loops. +/// +/// We keep track of Eligible Functions and Loops in +/// EligibleFunctionWorkloadMap and EligibleLoopWorkloadMap. +void WorkloadAnalysis::analyzeCodeMemRef(Function &F, + FunctionWorkloadT + &EligibleFunctionWorkloadMap, + LoopWorkloadMapT &EligibleLoopWorkloadMap) +{ + //If it is a function declaration we skip + if (F.isDeclaration()) + return; + + //if the function returns a non scalar type, it is not + //supported + if (isa(F.getReturnType())) + return; + + AA = &getAnalysis(); + LoopInfoCache.push_back( + std::move( getAnalysis(F).getLoopInfo() ) ); + LI = &LoopInfoCache.back(); + DT = &getAnalysis(F).getDomTree(); + + + //We analyze the nested Loops of the function in a buttom up + //manner. If a loop is not eligible, we already know that + //its parent loops and the function are not eligible either. + //In this case we stop analyzing. + std::stack Loops; + std::set Skip; + std::set Visited; + bool skipFunction = false; + + for (auto I = LI->begin(), IE = LI->end(); I!= IE; ++I) + Loops.push(*I); + + while (Loops.size()){ + Loop *L = Loops.top(); + if (!Visited.count(L)){ + for (auto I = L->begin(), IE = L->end(); I!= IE; ++I) + Loops.push(*I); + Visited.insert(L); + } else { + Loops.pop(); + + if (!Skip.count(L) && + !analyzeCodeMemRefEligibility(L, EligibleLoopWorkloadMap) ) { + Loop *LP = L->getParentLoop(); + while (LP){ + Skip.insert(LP); + LP = LP->getParentLoop(); + } + skipFunction = true; + } + } + } + + if (!skipFunction) + analyzeCodeMemRefEligibility(&F, EligibleFunctionWorkloadMap); +} + +/// \brief It maps the memory references of a function to +/// the Memory Allocations used in a particular function call +/// context. +/// +/// A memory allocation can be a Global Variable, a +/// dynamic allocation (malloc) or a stack allocation (alloca). 
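+// A source-level sketch of what this classification means (hypothetical
+// caller code, not part of this patch) for a callee
+// `void kernel(float *buf, int n)` whose only memory reference is `buf`:
+//
+//   float Global[256];
+//   void caller(void) {
+//     float Stack[256];
+//     float *Heap = (float *)malloc(256 * sizeof(float));
+//     kernel(Global, 256);   // buf -> GlobalAllocations
+//     kernel(Stack, 256);    // buf -> AllocaAllocations
+//     kernel(Heap, 256);     // buf -> MallocAllocations
+//   }
+//   void wrapper(float *p) {
+//     kernel(p, 256);        // buf originates from the caller's own
+//   }                        // interface: not supported yet, call rejected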
+static bool +inspectFunctionCallMemAllocUse(const CallInst *CI, + AliasAnalysis *AA, + Function *Caller, + WorkloadMemRefSet &FInterfaceMemRefs, + SetVector &GlobalAllocations, + SetVector &MallocAllocations, + SetVector &AllocaAllocations, + TargetLibraryInfo *TLI) +{ + for (auto I = FInterfaceMemRefs.begin(), + IE = FInterfaceMemRefs.end(); I!= IE; ++I) { + + WorkloadMemRef FMref, Mref; + FMref = *I; + assert(isa(FMref.Ptr)); + const Argument *Arg = dyn_cast(FMref.Ptr); + + AliasAnalysis::ModRefResult Mask = AliasAnalysis::ModRef; + Mref = AA->getArgLocation(CI, Arg->getArgNo(), Mask); + + Value *P = const_cast(Mref.Ptr); + P = analyzeMemRefValue(P); + P = resolveIfPHINode(P); + + //We check if we refer to a global variable + if (inspectGlobalVariable(P)){ + GlobalVariable *GV = dyn_cast(P); + GlobalAllocations.insert(GV); + continue; + } + + //malloc allocations + if (CallInst *MC = extractMallocCall(P, TLI)) { + MallocAllocations.insert(MC); + continue; + } + + //alloca instruction + if (AllocaInst *AI = dyn_cast(P)) { + AllocaAllocations.insert(AI); + continue; + } + + //We check if the memory reference is passed from + //the function interface. we don't support it yet + if (auto A = dyn_cast(P)){ + if (A->getParent() == Caller){ + return false; + //continue; + } + } + DEBUG(dbgs()<<"Analysis fails (Mem Allocation): "<<*P<<"\n"); + //Unable to detect the origin of the memory + return false; + } + + return true; +} + +/// \brief It performs Memory Allocation Use Analysis on +/// Function F. It analyzes all the call sites of Function F. +/// For each call site we map the memory references of the +/// function with the Memory Allocations used in the particular +/// function call context. +/// +/// A memory allocation can be a Global Variable, a dynamic +/// allocation (malloc) or a stack allocation (alloca). +void +WorkloadAnalysis::analyzeMemAllocUseEligibility(Function *F, + WorkloadInfo *WI, + FunctionCallersSet &Callers) +{ + assert( WI->isFunction() ); + + WorkloadInfo CWI = *WI; + + for (auto I = Callers.begin(), IE = Callers.end(); I!= IE; ++I){ + //AliasAnalysis is a Group Pass interface, an AA implementation may be + //a function or module pass, we get the analysis explicitly for the + //function to guarantee correct data. + Value *V; + Function *CallerF; + std::tie(V, CallerF) = *I; + + if (!V) + continue; + assert( isa(V) || isa(V) ); + + //we do not support code with exceptions for offloading + if (isa(V)) + continue; + + //We know it is a call instruction + CallInst *CI = dyn_cast(V); + + WorkloadInfo CWI = *WI; + CWI.CI = CI; + + auto insertGlobal = [&](const WorkloadMemRef &I) { + assert( inspectGlobalVariable(I.Ptr) ); + CWI.GlobalAllocations.insert( + dyn_cast( const_cast(I.Ptr) ) ); + }; + + //global variable accesses + std::for_each(WI->GlobalMemRefs.begin(), WI->GlobalMemRefs.end(), + insertGlobal); + + if (inspectFunctionCallMemAllocUse(CI, AA, CallerF, WI->InterfaceMemRefs, + CWI.GlobalAllocations, + CWI.MallocAllocations, + CWI.AllocaAllocations, TLI)) { + + CallWorkloadMap.insert(std::make_pair(CI, + new WorkloadInfo(std::move(CWI)))); + ECalls[F].insert(CI); + + DEBUG(dbgs()<<"Mem Allocation Analysis: Callee Function: "<getName() + <<" Caller Function: "<getName()<<" completed successfully\n"); + } + //else not eligible we skip it + } +} + +/// \brief It performs Memory Allocation Use Analysis on Loop +/// L. It analyzes all the call sites of its parent Function. 
+/// For each call site we map the memory references of the +/// Loop with the Memory Allocations used in the particular +/// function call context. If we can successfully analyze +/// all the call sites, this Loop is considered eligible. +/// +/// A memory allocation can be a Global Variable, a dynamic +/// allocation (malloc) or a stack allocation (alloca). +void +WorkloadAnalysis::analyzeMemAllocUseEligibility(Loop *L, + WorkloadInfo *WI, + FunctionCallersSet &Callers) +{ + assert( WI->isLoop() ); + + WorkloadInfo CWI = *WI; + + if (!Callers.size()) + return; + + for (auto I = Callers.begin(), IE = Callers.end(); I!= IE; ++I) { + //AliasAnalysis is a Group Pass interface, an AA implementation may be + //a function or module pass, we get the analysis explicitly for the + //function to guarantee correct data. + Value *V; + Function *CallerF; + std::tie(V, CallerF) = *I; + + if (!V) + continue; + + assert( isa(V) || isa(V) ); + + //we do not support code with exceptions for offloading + if (isa(V)) + continue; + + //We know it is a call instruction + CallInst *CI = dyn_cast(V); + + auto insertGlobal = [&](const WorkloadMemRef &I) { + assert( inspectGlobalVariable(I.Ptr) ); + CWI.GlobalAllocations.insert( + dyn_cast( const_cast(I.Ptr) ) ); + }; + + //global variable accesses + std::for_each(WI->GlobalMemRefs.begin(), WI->GlobalMemRefs.end(), + insertGlobal); + + // if we cannot reason for a partical call of the function we abort, + //the loop is not eligible + if (!inspectFunctionCallMemAllocUse(CI, AA, CallerF, + WI->InterfaceMemRefs, + CWI.GlobalAllocations, + CWI.MallocAllocations, + CWI.AllocaAllocations, TLI)) + return; + } + + LoopWorkloadMap.insert( + std::make_pair(WI->getLoop(), new WorkloadInfo(std::move(CWI)) ) ); + ELoops[WI->getFunction()].insert(L); + + DEBUG(dbgs()<<"Mem Allocation Analysis: Callee Function: "<< + WI->getFunction()->getName()<<" Loop: "<< + L->getHeader()->getName()<<" completed successfully\n"); +} + + +/// \brief It reverses the call graph information for the functions +/// we want to analyze their call sites. +static void +reverseCallGraphInfo(std::map &CallInfo, + const FunctionWorkloadT &EligibleFunctionWorkloadMap, + const LoopWorkloadMapT &EligibleLoopWorkloadMap, + CallGraph *CG) +{ + for (auto I = EligibleFunctionWorkloadMap.begin(), + IE = EligibleFunctionWorkloadMap.end(); I!= IE; ++I) + CallInfo[I->first]; + + for (auto I = EligibleLoopWorkloadMap.begin(), + IE = EligibleLoopWorkloadMap.end(); I!= IE; ++I) + CallInfo[I->first->getHeader()->getParent()]; + + WeakVH WV; + CallGraphNode *Callee; + CallGraphNode *ECN, *CEN; + + ECN = CG->getExternalCallingNode(); + CEN = CG->getCallsExternalNode(); + + for (auto I = CG->begin(), IE = CG->end(); I!= IE; ++I) { + if (I->second == ECN || I->second == CEN) + continue; + Function *CallerF = I->second->getFunction(); + for (auto J = I->second->begin(), JE = I->second->end(); J!= JE; ++J) { + std::tie(WV, Callee) = *J; + Function *CalleeF = Callee->getFunction(); + auto CI = CallInfo.find(CalleeF); + if (CI!= CallInfo.end()) + CI->second.insert(std::make_tuple(WV, CallerF)); + } + } +} + +bool WorkloadAnalysis::runOnModule(Module &M) +{ + CG = &getAnalysis().getCallGraph(); + TLI = &getAnalysis().getTLI(); + + // 1) We perform Code and Memory Reference Analysis on + // every Function definition and its nested Loops. 
+ for (auto I = M.begin(), IE = M.end(); I!= IE; ++I) + analyzeCodeMemRef(*I, EligibleFunctionWorkloadMap, + EligibleLoopWorkloadMap); + + // We get function call sites in a convenient representation + std::map CallInfo; + reverseCallGraphInfo(CallInfo, EligibleFunctionWorkloadMap, + EligibleLoopWorkloadMap, CG); + + // 2) We perform Memory Allocation Use Analysis on Functions that passed + // the first step, the Code and Memory Reference Analysis. + for (auto I = EligibleFunctionWorkloadMap.begin(), + IE = EligibleFunctionWorkloadMap.end(); I!= IE; ++I) + analyzeMemAllocUseEligibility(I->first, I->second, CallInfo[I->first]); + + // 2) We perform Memory Allocation Use Analysis on Loops that passed + // the first step, the Code and Memory Reference Analysis. + for (auto I = EligibleLoopWorkloadMap.begin(), + IE = EligibleLoopWorkloadMap.end(); I!= IE; ++I) + analyzeMemAllocUseEligibility(I->first, I->second, + CallInfo[I->first->getHeader()->getParent()]); + + return false; +} + +template < class T> +void deleteObjects(T &Workload) +{ + for (auto I = Workload.begin(), IE = Workload.end(); I!= IE; ++I) + delete I->second; + Workload.clear(); +} + +void WorkloadAnalysis::releaseMemory() +{ + deleteObjects(EligibleFunctionWorkloadMap); + deleteObjects(EligibleLoopWorkloadMap); + + deleteObjects(CallWorkloadMap); + deleteObjects(LoopWorkloadMap); + + ECalls.clear(); + ELoops.clear(); + + std::for_each(GarbageCollector.begin(), GarbageCollector.end(), + [](WorkloadInfo *I) { delete I; } ); + + deleteObjects(CallWorkloadMap); + GarbageCollector.clear(); +} + +template < class T, class U > +static void eraseWorkload(T &Workloads, const U &Key) +{ + auto I = Workloads.find(Key); + if (I == Workloads.end()) + return; + delete I->second; + Workloads.erase(I); +} + + +bool WorkloadAnalysis::setForOffloading(WorkloadInfo *WI) +{ + if ( WI->isCall() && CallWorkloadMap.count(WI->getCall()) ) { + Function *F = WI->getFunction(); + GarbageCollector.push_back(WI); + CallWorkloadMap.erase(WI->getCall()); + //find and remove the loops of the function that are eligible + //for offloading. 
+ std::set &Loops = ELoops[F]; + for (auto I = Loops.begin(), IE = Loops.end(); I!= IE; ++I) + eraseWorkload(LoopWorkloadMap, *I); + ELoops.erase(F); + + return true; + } else if ( WI->isLoop() && LoopWorkloadMap.count(WI->getLoop()) ) { + GarbageCollector.push_back(WI); + LoopWorkloadMap.erase(WI->getLoop()); + Loop *L = WI->getLoop(); + + //Remove parent function calls from the eligible workloads + Function *PF = L->getHeader()->getParent(); + std::set &Calls = ECalls[PF]; + for (auto I = Calls.begin(), IE = Calls.end(); I!= IE; ++I) + eraseWorkload(CallWorkloadMap, *I); + ECalls.erase(PF); + + //Remove eligibles loops for the same function + std::set &Loops = ELoops[PF]; + for (auto I = Loops.begin(), IE = Loops.end(); I!= IE; ++I) + eraseWorkload(LoopWorkloadMap, *I); + ELoops.erase(PF); + + return true; + } + return false; +} + +bool WorkloadAnalysis::MutateLoopToCallWorkload(WorkloadInfo *WI, + Function *F, CallInst *CI) +{ + bool RV; + WI->F = F; + WI->CI = CI; + if (WI->type == WorkloadType::LoopNoGlobals) + WI->type = WorkloadType::CallNoGlobals; + else + WI->type = WorkloadType::CallWithGlobals; + + WI->InterfaceMemRefs.clear(); + WI->GlobalMemRefs.clear(); + + RV = std::all_of(F->begin(), F->end(), + std::bind(inspectBasicBlockMemAccess, + std::placeholders::_1, + AA, F, std::ref(WI->InterfaceMemRefs), + std::ref(WI->GlobalMemRefs))); + + assert(RV); + return RV; +} + +/// \brief It orders the values alphabetically for printing. +static void printOrdered(llvm::raw_ostream &O, + std::vector &Names) +{ + std::sort(Names.begin(), Names.end()); + for (auto I = Names.begin(), IE=Names.end(); I != IE; ++I) + O<<*I<<" "; +} + +//reportMemRef reports the Memory References of a Workload +static void reportMemRef(llvm::raw_ostream &O, WorkloadInfo *WI) +{ + std::vector Names; + + O<<"\t\tInterfaceMemRefs: "; + for (auto J = WI->getInterfaceMemRefs().begin(), + JE = WI->getInterfaceMemRefs().end(); J!= JE; ++J) + Names.push_back(J->Ptr->getName()); + + printOrdered(O,Names); + Names.clear(); + + O<<"\n\t\tGlobalMemRefs: "; + for (auto J = WI->getGlobalMemRefs().begin(), + JE = WI->getGlobalMemRefs().end(); J!= JE; ++J) + Names.push_back(J->Ptr->getName()); + + printOrdered(O,Names); + O<<"\n"; +} + +//reportAllocaRef reports the Memory Allocations accessed by a Workload +static void reportAllocaRef(llvm::raw_ostream &O, WorkloadInfo *WI) +{ + std::vector Names; + + O<<"\t\tGlobalAllocations: "; + for (auto J = WI->getGlobalAllocations().begin(), + JE = WI->getGlobalAllocations().end(); J!= JE; ++J) + Names.push_back((*J)->getName()); + + printOrdered(O,Names); + Names.clear(); + + O<<"\n\t\tMallocAllocations: "; + for (auto J = WI->getMallocAllocations().begin(), + JE = WI->getMallocAllocations().end(); J!= JE; ++J) + Names.push_back((*J)->getName()); + + printOrdered(O,Names); + Names.clear(); + + O<<"\n\t\tAllocaAllocations: "; + for (auto J = WI->getAllocaAllocations().begin(), + JE = WI->getAllocaAllocations().end(); J!= JE; ++J) + Names.push_back((*J)->getName()); + + printOrdered(O,Names); + O<<"\n"; +} + + +void WorkloadAnalysis::print(llvm::raw_ostream &O, const Module *M) const +{ + typedef std::tuple Info; + std::vector PrintInfo; + auto InfoCmp = [](const Info &A, const Info &B) { + return std::get<0>(A).compare(std::get<0>(B)) < 0; + }; + + O<<"Eligible Functions (Code and Memory Reference Analysis):\n"; + for (auto I = EligibleFunctionWorkloadMap.begin(), + IE = EligibleFunctionWorkloadMap.end(); I!= IE; ++I) { + PrintInfo.push_back(std::make_tuple(I->first->getName(), 
I->second)); + } + + std::sort(PrintInfo.begin(), PrintInfo.end(), InfoCmp); + for (auto I = PrintInfo.begin(), IE = PrintInfo.end(); I != IE; ++I) { + O<<"\t"<(*I)<<":\n"; + reportMemRef(O, std::get<1>(*I)); + } + O<<"-----------------------------------\n"; + + O<<"Eligible Loops (Code and Memory Reference Analysis):\n"; + PrintInfo.clear(); + + for (auto I = EligibleLoopWorkloadMap.begin(), + IE = EligibleLoopWorkloadMap.end(); I!= IE; ++I) { + std::string Key = I->first->getHeader()->getName(); + Key += ", Parent Function: "; + Key += I->first->getHeader()->getParent()->getName(); + PrintInfo.push_back( std::make_pair(Key, I->second) ); + } + + std::sort(PrintInfo.begin(), PrintInfo.end(), InfoCmp); + for (auto I = PrintInfo.begin(), IE = PrintInfo.end(); I != IE; ++I) { + O<<"\t"<(*I)<<":\n"; + reportMemRef(O, std::get<1>(*I)); + } + O<<"-----------------------------------\n"; + + O<<"Eligible Functions (Memory Allocation Use Analysis):\n"; + PrintInfo.clear(); + + for (auto I = CallWorkloadMap.begin(), + IE = CallWorkloadMap.end(); I!= IE; ++I) { + std::string Key = I->first->getName(); + Key += ", Callee: "; + Key += I->second->getFunction()->getName(); + Key += " Caller: "; + Key += I->first->getParent()->getParent()->getName(); + PrintInfo.push_back( std::make_pair(Key, I->second) ); + } + + std::sort(PrintInfo.begin(), PrintInfo.end(), InfoCmp); + for (auto I = PrintInfo.begin(), IE = PrintInfo.end(); I != IE; ++I) { + O<<"\t"<(*I)<<":\n"; + reportAllocaRef(O, std::get<1>(*I)); + } + O<<"-----------------------------------\n"; + + O<<"Eligible Loops (Memory Allocation Use Analysis):\n"; + PrintInfo.clear(); + + for (auto I = LoopWorkloadMap.begin(), + IE = LoopWorkloadMap.end(); I!= IE; ++I) { + std::string Key = I->first->getHeader()->getName(); + Key += ", Parent Function: "; + Key += I->first->getHeader()->getParent()->getName(); + PrintInfo.push_back( std::make_pair(Key, I->second) ); + } + + std::sort(PrintInfo.begin(), PrintInfo.end(), InfoCmp); + for (auto I = PrintInfo.begin(), IE = PrintInfo.end(); I != IE; ++I) { + O<<"\t"<(*I)<<":\n"; + reportAllocaRef(O, std::get<1>(*I)); + } + O<<"-----------------------------------\n"; +} + +char WorkloadAnalysis::ID = 1; //just because everyone sets it to 0 +INITIALIZE_PASS_BEGIN(WorkloadAnalysis, + "hexe-analysis", "Hexe Workload Analysis", false, true) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(WorkloadAnalysis, "hexe-analysis", + "Hexe Workload Analysis", false, true) + +ModulePass *createWorkloadAnalysisPass() +{ + return new WorkloadAnalysis(); +} + +WorkloadAnalysis::WorkloadAnalysis():ModulePass(ID), AA(nullptr), + LI(nullptr), DT(nullptr), CG(nullptr) +{ + initializeWorkloadAnalysisPass(*PassRegistry::getPassRegistry()); +} + +void WorkloadAnalysis::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + Index: lib/Transforms/Hexe/WorkloadExtractor.cpp =================================================================== --- /dev/null +++ lib/Transforms/Hexe/WorkloadExtractor.cpp @@ -0,0 +1,1222 @@ +// === WorkloadExtractor.cpp - Heterogeneous Execution Engine -*- C++ -*-=== // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois 
Open Source +// License. See LICENSE.TXT for details. +// +// === ------------------------------------------------------------------=== // +/// \file +/// Implementation of the Workload Extraction Utilities and Pass of the +/// Heterogeneous Execution Engine. Please read the header file documentation +/// for high level description. +// === ------------------------------------------------------------------=== // + + +#include "llvm/Transforms/Hexe/WorkloadAnalysis.h" +#include "llvm/Transforms/Hexe/WorkloadExtractor.h" +#include "llvm/Transforms/Hexe/InitializeHexePasses.h" +#include "llvm/Transforms/Hexe/Hexe.h" +#include "llvm/Transforms/Hexe/Utils.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/DominanceFrontier.h" +#include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include + + +#define DEBUG_TYPE "hwe" + +using namespace llvm; +HexeWorkload::HexeWorkload(LLVMContext &C, StringRef MSID) + : C(C), MSID(MSID), DL(nullptr), TargetTriple(nullptr), M(nullptr) +{ + nextFunctionID = 0; + M = new Module(MSID, C); +} + +void HexeWorkload::setDLandTriple(const DataLayout &DL, + const Triple &TargetTriple) +{ + delete this->DL; + this->DL = new DataLayout(DL); + delete this->TargetTriple; + this->TargetTriple = new Triple(TargetTriple); +} + +void HexeWorkload::writeFile(StringRef Filename) +{ + assert( DL && TargetTriple ); + //writes Hexe Metadata on the module + writeHexeMetadata(M, nextFunctionID, Mapping, *TargetTriple); + + //keeps the original host DataLayout + //sets special Hexe Target Triple + M->setDataLayout(DL->getStringRepresentation()); + M->setTargetTriple("hexe-unknown--unknown"); + + //writes the module to File + std::error_code EC; + raw_fd_ostream fs(Filename, EC, sys::fs::F_None); + WriteBitcodeToFile(M, fs); +} + +void HexeWorkload::writeModule(Module *M) +{ + assert( DL && TargetTriple ); + + //writes Hexe Metadata on the module + writeHexeMetadata(M, nextFunctionID, Mapping, *TargetTriple); + + //keeps the original host DataLayout + //sets special Hexe Target Triple + M->setDataLayout(DL->getStringRepresentation()); + M->setTargetTriple("hexe-unknown--unknown"); +} + + +HexeFunctionStructInfoT +HexeWorkload::addFunctionPrototype(Function *F, FunctionType *FT) +{ + assert( Mapping.find(F) == Mapping.end() ); + + Function *EF = + Function::Create(FT, GlobalValue::AvailableExternallyLinkage, + F->getName(), M); + StructType *ST = getCompactFunctionStruct(FT, C); + auto T = std::make_tuple(EF, nextFunctionID++, ST); + Mapping.insert(std::make_pair(F, T)); + + return T; +} + + +/// In principal, we do not support offloading of loops or +/// functions that contain function calls. However, we +/// do an exception for Memory Intrinsics. This group +/// of intrinsics performs standard memory operations +/// which are supported across the various LLVM +/// Targets. This function injects valid Function +/// Declarations for the Memory intrinsics that are +// used in the Hexe Module. 
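+// For example, a cloned workload containing
+//
+//   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 256, i32 4, i1 false)
+//
+// keeps that call: the loop below re-creates a matching declaration for the
+// intrinsic inside the standalone workload module and repoints the call site
+// at it, since the cloned body would otherwise still reference the host
+// module's declaration.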
+void HexeWorkload::validateSupportedFunctionCalls() +{ + std::map > Registry; + + //function iterator + for (auto I = M->begin(), IE = M->end(); I!= IE; ++I) + //BasicBlock iterator + for (auto J = I->begin(), JE = I->end(); J!= JE; ++J) + // Instruction Iterator + for (auto K = J->begin(), KE = J->end(); K!= KE; ++K) + if ( auto CI = dyn_cast(K) ) + Registry[CI->getCalledFunction()].push_back(CI); + + + for (auto I = Registry.begin(), IE = Registry.end(); I!= IE; ++I) { + Function *LF = Function::Create(I->first->getFunctionType(), + I->first->getLinkage(), + I->first->getName(), M); + LF->setCallingConv(I->first->getCallingConv()); + LF->setAttributes(I->first->getAttributes()); + + for (auto J = I->second.begin(), JE = I->second.end(); J!= JE; ++J) + (*J)->setCalledFunction(LF); + } +} + + +/// \brief It detects the Call Instruction in Function \param +/// Caller that calls the Function \param Callee. +/// +/// When we extract a Loop it gets converted into a function. We +/// use findCallInst to detect the call instruction in the original +/// Function that calls the extracted Loop function. +static CallInst *findCallInst(Function *Caller, Function *Callee) +{ + for (auto I = Caller->begin(), IE = Caller->end(); I!= IE; ++I) { + for (auto J = I->begin(), JE = I->end(); J!= JE; ++J) { + if ( CallInst *CI = dyn_cast(J) ) { + if (CI->getCalledFunction() == Callee) + return CI; + } + } + } + + assert( 0 && "We shouldn't reach here" ); + return nullptr; +} + +/// \brief It extracts a Loop and creates a new function that +/// solely contains the Loop. It then contacts the Workload +/// Analysis to update the Workload Info for this Workload. +/// +/// Use case: If we decide to offload a Loop workload, we +/// convert it to a Function Call Workload and we then +/// treat it as a Function Call Offloading. +bool WorkloadExtractUtil::loopToFunction(WorkloadInfo *WI, + WorkloadInfo *CW, + DominatorTree *DT) +{ + //we extract the loop as a function + Loop *L = WI->getLoop(); + Function *PF = L->getHeader()->getParent(); + CodeExtractor CE(*DT, *L, false); + assert( CE.isEligible() ); + Function *LF = CE.extractCodeRegion(); + DT->verifyDomTree(); + + CallInst *CI = findCallInst(PF, LF); + *CW = *WI; + + // We update the Workload Info contents + WA->MutateLoopToCallWorkload(CW, LF, CI); + return true; +} + + +/// \brief This function clones a function to the Hexe Workload Module. +//// +/// We perform the following on the cloned Function: +/// If the function accesses Global Variables we need to adjust the +/// function interface. We append an argument for every Global Variable +/// the function accesses. The argument has the same type as the Global +/// Variable. We update all the Global Variable references to point +/// to the newly added arguments. +/// The original function remains intact. +Function *WorkloadExtractUtil::cloneOrGetHexeFunction(WorkloadInfo *WI) +{ + // If a Hexe function has already been cloned we return the Hexe + // function. + if (auto I = HW->getMapping(WI->getFunction())) + return std::get<0>( *I ); + + Function *OF = WI->getFunction(); + FunctionType *OFT = OF->getFunctionType(); + FunctionType *EFT; + std::queue SValues; + + // If the function accesses Global Variables we need to adjust the + // function interface. We append an argument for every Global Variable + // the function accesses. The argument should have the same type + // as the Global Variable. 
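+// Sketch of the interface adjustment (illustrative names): a workload
+//
+//   @Coeffs = global [16 x float] ...
+//   define float @filter(float* %in, i32 %n)   ; reads @Coeffs
+//
+// is cloned into the Hexe module as
+//
+//   define float @filter(float* %in, i32 %n, [16 x float]* %Coeffs.arg)
+//
+// and every use of @Coeffs inside the clone is remapped to the appended
+// argument, while the original host function is left untouched.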
+ if (WI->getGlobalMemRefs().size()) { + SmallVector ArgTypes; + for (auto I = OFT->param_begin(), IE = OFT->param_end(); I!= IE; ++I) + ArgTypes.push_back(*I); + + WorkloadMemRefSet GV = WI->getGlobalMemRefs(); + for (auto I = GV.begin(), IE = GV.end(); I!= IE; ++I){ + ArgTypes.push_back( I->Ptr->getType() ); + SValues.push(I->Ptr); + } + + EFT = FunctionType::get( OFT->getReturnType(), ArgTypes, false); + } else + EFT = OFT; + + // We add a new Function Prototype on Hexe Workload. + Function *EF; + std::tie(EF, std::ignore, std::ignore) = HW->addFunctionPrototype(OF, EFT); + + // We clone the function body. We update all the Global Variable references + // to point to the newly added arguments. + ValueToValueMapTy VMap; + Function::ArgumentListType &OFArgs = OF->getArgumentList(); + Function::ArgumentListType &EFArgs = EF->getArgumentList(); + + auto J = EFArgs.begin(); + for (auto I = OFArgs.begin(), IE = OFArgs.end(); I!= IE; ++I, ++J) + VMap.insert( std::make_pair( I, WeakVH(J) ) ); + + for (auto JE = EFArgs.end(); J!= JE; ++J) { + VMap.insert( std::make_pair(SValues.front(), WeakVH(J)) ); + SValues.pop(); + } + + SmallVector Returns; + CloneFunctionInto(EF, OF, VMap, false, Returns, ""); + + return EF; +} + +// Marshalling of function call arguments and global variables +// +// Each time we need to dispatch a workload for computation on the +// accelerator we need to "pack" the call arguments and global +// variable references on a buffer. The reason for doing that +// is that the runtime library function calls that perform the +// offloading provide a generic interface, which is agnostic +// of the particular Workload. At this step we treat pointer +// arguments as scalars and we simply copy their values. The +// runtime is responsible to convert them to valid values in +// the execution context of the accelerator. +// We represent the dispatch data in the following format in +// the buffer: +// ___________________ +// | Call Arg 0 | +// | Call Arg 1 | +// | Call Arg 2 | +// | ... | +// | Global Var Ref 0 | +// | Global Var Ref 1 | +// | .... | +// | Return Value | +// |___________________| + +/// \brief this function marshals the Function Call Arguments +/// and the Global Variable references of the function call +/// on a buffer. At this point we treat pointer values as +/// scalars where just copy the value. This operation is +/// performed before dispatching a workload execution on the +/// accelerator. +static void prologMarshalling(Instruction *II, Value *StructPtr, + StructType *ST, WorkloadInfo *WI, + CallInst *OCI, LLVMContext &C) +{ + //lambda function for writing a particular argument on the buffer + ConstantInt *ZeroIndex = ConstantInt::get(Type::getInt32Ty(C), 0); + auto gepstore = [ &StructPtr, &C, &II, &ST, &ZeroIndex ] + ( unsigned Indx, Value *In) { + Value *indices[] = { ZeroIndex, + ConstantInt::get(Type::getInt32Ty(C), Indx) }; + auto P = + GetElementPtrInst::CreateInBounds(StructPtr, + ArrayRef(indices, 2), "", II); + return new StoreInst(In, P, II); + }; + + unsigned Indx = 0; + + //we marshal the original arguments + for (unsigned I = 0, IE = OCI->getNumArgOperands(); I!= IE; ++I) + gepstore(Indx++, OCI->getArgOperand(I)); + + //We append the Global Variable references + for (auto I = WI->getGlobalMemRefs().begin(), + IE = WI->getGlobalMemRefs().end(); I!= IE; ++I) + gepstore(Indx++, const_cast(I->Ptr)); +} + +/// \brief This function is the complement of prologMarshalling. 
It +/// is called after the completion of a workload dispatch and +/// reads the return values back from the buffer. +static Instruction *epilogueMarshalling(Instruction *II, Value *StructPtr, + StructType *ST, WorkloadInfo *WI, + CallInst *OCI, LLVMContext &C) +{ + //lambda function for reading a value from the buffer + ConstantInt *ZeroIndex = ConstantInt::get(Type::getInt32Ty(C), 0); + auto gepload = [ &StructPtr, &C, &II, &ST, &ZeroIndex ] + ( unsigned Indx) { + Value *indices[] = { ZeroIndex, + ConstantInt::get(Type::getInt32Ty(C), Indx) }; + auto P = + GetElementPtrInst::CreateInBounds(StructPtr, + ArrayRef(indices, 2), "", II); + return new LoadInst(P, "", II); + }; + + //if the function type returns a value we read it from the buffer. + FunctionType *FT = OCI->getCalledFunction()->getFunctionType(); + if (FT->getReturnType()->isVoidTy()) + return nullptr; + + return gepload( ST->getNumElements() -1 ); +} + + + +// Passing Memory Access Information to the runtime library. +// +// We pass information about the memory accesses of the Workload +// to the runtime. The runtime may use this information for a number +// of operations such as enforce coherency, perform runtime checks +// or data transfers. +// +// The following struct type provides information for a single memory +// access. +// struct MemAccessInfo{ +// void *ptr; //a pointer to the accessed memory segment. +// unsigned size; //a static analysis estimation about the segment size +// unsigned arg_order; // the argument of the Hexe Function (the cloned, +// // extracted function version) that gets mapped to +// // this segment. +// access_mode; // the access mode, read 0, write 1, readwrite 3; +// }; +// We provide an array of this struct to the runtime. Each entry represents +// a different memory segment. +// +// + + +/// \brief It allocates the required memory for the Memory Access Info Array +/// on the stack. +static Value *MemAccessInfoBufferAlloc(Instruction *II, WorkloadInfo *WI, + LLVMContext &C, + StructType *MemoryAccessInfoT) +{ + size_t N = WI->getInterfaceMemRefs().size() + WI->getGlobalMemRefs().size(); + + if (!N) + return ConstantPointerNull::get(PointerType::getUnqual(MemoryAccessInfoT)); + + Value *NV = ConstantInt::get(Type::getInt32Ty(C), N); + return new AllocaInst(MemoryAccessInfoT, NV, "", II); +} + +/// \brief It writes the data of a particular Memory Access Info Array element. +static Instruction *writeMemAccessInfo(Instruction *II, LLVMContext &C, + Value *StructArP, Value *MemPtr, + unsigned MemSize, unsigned ArrayIndex, + unsigned ArgOrder, + MemAccessInfoAccessMode Mode) +{ + // lambda function that writes a single field of the MemAccessInfo + // struct at a time. 
+ auto gepstore = [ &StructArP, &C, &II ] + ( unsigned Indx0, unsigned Indx1, Value *In) { + Value *indices[] = { + ConstantInt::get(Type::getInt32Ty(C), Indx0), + ConstantInt::get(Type::getInt32Ty(C), Indx1) + }; + auto P = + GetElementPtrInst::CreateInBounds(StructArP, + ArrayRef(indices, 2), "", II); + return new StoreInst(In, P, II); + }; + + //set ptr + Type *VoidPtr = PointerType::getUnqual( Type::getInt8Ty(C) ); + Value *PtrC = new BitCastInst(MemPtr, VoidPtr, "", II); + gepstore(ArrayIndex, 0, PtrC); + //set size + Value *MemSizeV = ConstantInt::get(Type::getInt32Ty(C), MemSize); + gepstore(ArrayIndex, 1, MemSizeV); + //set arg_order + Value *ArgOrderV = ConstantInt::get(Type::getInt32Ty(C), ArgOrder); + gepstore(ArrayIndex, 2, ArgOrderV); + //set access mode + Value *AccessModeV = ConstantInt::get(Type::getInt8Ty(C), + static_cast(Mode) ); + return gepstore(ArrayIndex, 3, AccessModeV); +} + +/// \brief It allocates and sets the Memory Access Info Array +static Value* +allocateAndSetMemAccessInfoArray(Instruction *II, CallInst *CI, + WorkloadInfo *WI, LLVMContext &C, + StructType *MemoryAccessInfoT) +{ + //array allocation + Value *StructArray = MemAccessInfoBufferAlloc(II, WI, C, MemoryAccessInfoT); + + if (!( WI->getInterfaceMemRefs().size() + WI->getGlobalMemRefs().size())) + return StructArray; + + unsigned ArgOrder = 0; + unsigned ArrayIndex = 0; + + // pass information for memory segments provided by the original function + // interface + auto &FMRefs = WI->getInterfaceMemRefs(); + for (auto I = WI->getFunction()->getArgumentList().begin(), + IE = WI->getFunction()->getArgumentList().end(); I!= IE; ++I) { + WorkloadMemRef Query; + Query.Ptr = I; + auto MR = FMRefs.find(Query); + if ( MR!= FMRefs.end() ){ + writeMemAccessInfo(II, C, StructArray, + CI->getArgOperand(ArgOrder), + MR->Size, ArrayIndex++, + ArgOrder, MemAccessInfoAccessMode::ReadWrite); + } + ++ArgOrder; + } + + //pass information for global memory segments. + for (auto I = WI->getGlobalMemRefs().begin(), + IE = WI->getGlobalMemRefs().end(); I!= IE; ++I) { + writeMemAccessInfo(II, C, StructArray, + const_cast(I->Ptr), I->Size, ArrayIndex++, + ArgOrder++, MemAccessInfoAccessMode::ReadWrite); + } + + return StructArray; +} + +/// \brief This functions generates the code that performs th Workload +/// offloading. 
+/// +/// It does the following: +/// a) Allocates and sets the Memory Access Info Array +/// b) Enforces coherency by calling __hexe_enforce_coherency +/// c) Marshals the call arguments +/// d) Performs the offloading by calling __hexe_dispatch +/// e) Waits for its completion by calling __hexe_event_wait +/// f) Enforces coherency by calling __hexe_enforce_coherency +/// g) Reads the return value if any +Instruction * +WorkloadExtractUtil::marshalAndOffload(Instruction *II, CallInst *CI, + WorkloadInfo *WI) +{ + unsigned FID; + StructType *ST; + LLVMContext &C = HM->getContext(); + const DataLayout &DL = HM->getDataLayout(); + std::tie(std::ignore, FID, ST) = *HW->getMapping(CI->getCalledFunction()); + + Value *FIDV = ConstantInt::get(Type::getInt32Ty(C), FID); + + // a) Allocates and sets the Memory Access Info Array + Value *MemAccessInfo = + allocateAndSetMemAccessInfoArray(II, CI, WI, C, MemoryAccessInfoT); + + // b) Enforces coherency by calling __hexe_enforce_coherency + Value *MemAccNV = ConstantInt::get(Type::getInt32Ty(C), + WI->getInterfaceMemRefs().size() + + WI->getGlobalMemRefs().size()); + Value *CohArgs[] = { FIDV, MemAccessInfo, MemAccNV }; + CallInst::Create(HexeCoherencyCall, ArrayRef(CohArgs, 3), "", II); + + // c) Marshals the call arguments + Type *VoidPtr = PointerType::getUnqual( Type::getInt8Ty(C) ); + AllocaInst *SA = new AllocaInst(ST, "offload_data", II); + prologMarshalling(II, SA, ST, WI, CI, C); + + // d) Performs the offloading by calling __hexe_dispatch + Value *CArgsP = new BitCastInst(SA, VoidPtr, "", II); + Value *CArgsSize = ConstantInt::get(Type::getInt32Ty(C), + DL.getTypeAllocSize(ST) ); + Value *Args[] = { FIDV, CArgsP, CArgsSize }; + + Value *Event = CallInst::Create(HexeDispatchCall, ArrayRef(Args, 3), + "hexe_offload", II); + + // e) Waits for its completion by calling __hexe_event_wait + CallInst::Create(HexeEventWaitCall, ArrayRef(Event), + "hexe_wait", II); + + // f) Enforces coherency by calling __hexe_enforce_coherency + CallInst::Create(HexeCoherencyCall, ArrayRef(CohArgs, 3), + "", II); + + // g) Reads the return value if any + return epilogueMarshalling(II, SA, ST, WI, CI, C); +} + +/// \brief It inserts a runtime call to __hexe_runtime_sched +/// function which controls at runtime if the Workload will +/// be executed on the CPU or the accelerator. +/// +/// Future plans (TODO): support runtime checks via this call. +/// We should provide access to the Memory Access Info Array +/// and Kernel Code Description. 
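+// At the call site this roughly materializes as (types and labels
+// abbreviated; value names illustrative):
+//
+//   %d  = call i32 @__hexe_runtime_sched(i32 <function_id>,
+//                     %hexe_mem_access_info_t* null, i32 0,
+//                     %hexe_kernel_info__t* null)
+//   %go = trunc i32 %d to i1                  ; hexe_sched_decision_cast
+//
+// and the enclosing transformation then branches on %go to select between
+// the CPU path and the offloading path.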
+Value *WorkloadExtractUtil::injectRuntimeSched(Instruction *II, + CallInst *CI, WorkloadInfo *WI) +{ + unsigned FID; + LLVMContext &C = HM->getContext(); + std::tie(std::ignore, FID, std::ignore) = *HW->getMapping( + CI->getCalledFunction()); + + //void __hexe_runtime_sched(unsigned id, MemoryAccessInfo *, + //unsigned num_elems, hexe_kernel_info__t *); + + Value *FIDV = ConstantInt::get(Type::getInt32Ty(C), FID); + Value *MemAccessP = + ConstantPointerNull::get(PointerType::getUnqual(MemoryAccessInfoT)); + Value *MemAccessNumV = ConstantInt::get(Type::getInt32Ty(C), 0); + Value *KernelInfoP = + ConstantPointerNull::get(PointerType::getUnqual(HexeKernelInfoT)); + + Value *Args[] = { FIDV, MemAccessP, MemAccessNumV, KernelInfoP }; + Value *D = CallInst::Create(HexeRuntimeSchedCall, ArrayRef(Args, 4), + "hexe_sched_decision", II); + + return CastInst::CreateIntegerCast(D, Type::getInt1Ty(C), false, + "hexe_sched_decision_cast", II); +} + +/// \brief This functions performs transformations on the Basic Block +/// that contains the function call we decided to offload. It also +/// injects the required Hexe runtime library Calls. +/// +/// We split the Basic Block at the function call site and we insert +/// a runtime scheduling call. Based on the value it returns, the +/// Workload will either be executed on the CPU or the accelerator. +/// We create two new Basic Blocks. The one performs the execution +/// on the CPU, the other performs the necessary actions in order +/// to offload to the accelerator. Finally we have a Converge Block +/// with a Phi Node for the function call return values. After the +/// PhiNode instructions we append the original instructions. +/// +/// Transformation Overview +/// BB before Fun Call +/// +------------------+ +/// | Instruction 0 | +/// | Instruction 1 | +/// | Instruction 2 | +/// Original Basic Block | Instruction 3 | +/// +------------------+ | | +/// | Instruction 0 | |Call Runtime_Sched| +/// | Instruction 1 | |SChed Branch | +/// | Instruction 2 | +----+------+------+ +/// | Instruction 3 | CPU Execution | | Offloading +/// | | Basic Block | | Basic Block +/// | .............. | |-------+ +------| +/// | | +----> +------------------+ +------------------+ +/// | CallInst @Func | | CallInst @Func | | MemAccessInfo set| +/// | | +------+-----------+ | Enforce Coherency| +/// | .............. | | | Call Data Marsh. | +/// | Instruction N | | | Dispatch Workload| +/// | Instruction N+1 | | | Wait Completion | +/// | .............. | | | Enforce Coherency| +/// | | | | Read Return Value| +/// | | | +-------------+----+ +/// | | | | +/// +------------------+ | | +/// | | +/// +----+Merge Basic Block +---+ +/// +------------------+ +/// |PhiNode(Ret. Val) | +/// | .............. | +/// | Instruction N | +/// | Instruction N+1 | +/// | .............. 
| +/// +------------------+ +/// +bool WorkloadExtractUtil::transformAndInjectGlueCode(WorkloadInfo *WI) +{ + TerminatorInst *ThenTerm; + TerminatorInst *ElseTerm; + CallInst *OCI = WI->getCall(); + + //insert runtime scheduling call + Value *SchedCond = injectRuntimeSched(OCI, OCI, WI); + + //split the Basic Block + SplitBlockAndInsertIfThenElse(SchedCond, OCI, &ThenTerm, &ElseTerm); + + //insert Code for offloading + Instruction *CallThen = marshalAndOffload(ThenTerm, OCI, WI); + + //insert Code for cpu execution + Instruction *CallElse = OCI->clone(); + CallElse->insertBefore(ElseTerm); + + //insert Phi Node for the return value + FunctionType *FT = OCI->getCalledFunction()->getFunctionType(); + if (!FT->getReturnType()->isVoidTy()) { + Instruction *InPos = OCI->getParent()->getFirstNonPHI(); + Type *PHIType = OCI->getCalledFunction()->getReturnType(); + + PHINode *PN = PHINode::Create(PHIType, 2, "hexe_converge", InPos); + PN->addIncoming(CallThen, ThenTerm->getParent()); + PN->addIncoming(CallElse, ElseTerm->getParent()); + + OCI->replaceAllUsesWith(PN); + } + + OCI->eraseFromParent(); + return true; +} + +/// \brief Annotates the Global Variable, Heap and Stack allocations +/// used by the Workload for replacement with equivalents that use +/// the Hexe library functions for memory allocation. +void WorkloadExtractUtil::annotateMemAllocationsForReplacement( + WorkloadInfo *WI) +{ + GlobalAllocations.insert(WI->getGlobalAllocations().begin(), + WI->getGlobalAllocations().end()); + MallocAllocations.insert(WI->getMallocAllocations().begin(), + WI->getMallocAllocations().end()); + AllocaAllocations.insert(WI->getAllocaAllocations().begin(), + WI->getAllocaAllocations().end()); +} + +bool WorkloadExtractUtil::extractWorkloadCode(WorkloadInfo *WI) +{ + WorkloadInfo *W2E; + WorkloadInfo Tmp; + DominatorTreeWrapperPass &DTW = + HP->getAnalysis(*(WI->getFunction())); + + DominatorTree *DT = &DTW.getDomTree(); + WA->setForOffloading(WI); + + //If the Workload is a Loop we need to transform it to a function + //first. + if (WI->isLoop()) { + loopToFunction(WI, &Tmp, DT); + W2E = &Tmp; + } else + W2E = WI; + + //annotate the Memory Allocations used by the Workload + //for replacement. + annotateMemAllocationsForReplacement(W2E); + + //Clone the function on the Hexe Workload Module. + cloneOrGetHexeFunction(W2E); + + //transform control flow and inject runtime calls. + transformAndInjectGlueCode(W2E); + + return true; +} + +/// \brief it replace a malloc function call with a call to hexe_malloc, +/// the malloc function equivalent of the Hexe runtime library. +static void replaceMalloc(CallInst *M, Function *HexeMalloc, Module *HM) +{ + LLVMContext &C = HM->getContext(); + //0 alignment set for now, + //meaning the implementation uses the + //standard alignment. + Value *Al = ConstantInt::get(Type::getInt32Ty(C), 0); + Value *Args[] = { M->getArgOperand(0), Al }; + CallInst *HMC = + CallInst::Create(HexeMalloc, ArrayRef(Args, 2), "", M); + + M->replaceAllUsesWith(HMC); + M->eraseFromParent(); +} + +/// \brief it replaces a free function call with a call to hexe_free, +/// the free function equivalent of the Hexe runtime library. 
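+// On the offloading path the glue code emitted above boils down to the
+// following shape (hedged sketch, value names illustrative):
+//
+//   call void @__hexe_enforce_coherency(i32 <id>, %hexe_mem_access_info_t* %mai, i32 <n>)
+//   ; marshal the call arguments and referenced globals into %offload_data
+//   %ev = call %hexe_event_t* @__hexe_dispatch(i32 <id>, i8* %args, i32 <size>)
+//   %rc = call i32 @__hexe_event_wait(%hexe_event_t* %ev)
+//   call void @__hexe_enforce_coherency(i32 <id>, %hexe_mem_access_info_t* %mai, i32 <n>)
+//   ; read the return value, if any, back out of %offload_data
+//
+// while the CPU path keeps a clone of the original call; both paths meet in
+// the PHI node that forwards the return value to the original users.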
+static void replaceFree(CallInst *F, Function *HexeFree, Module *HM) +{ + CallInst *HFC = + CallInst::Create(HexeFree, + ArrayRef(F->getArgOperand(0)), "", F); + + F->replaceAllUsesWith(HFC); + F->eraseFromParent(); +} + + +// >> Replace Global Variable +// +// key Goal: Replace Global Variables with equivalents that use +// memory provided by the Hexe runtime facilities instead of using +// the data segment. Make the minimal changes. +// +// How LLVM handles Global Variables: +// A Global Variable of type T is handled as Pointer T (T *) value. +// Whenever a function needs to access the value it performs load +// and store operations. +// +// The replacement strategy: +// +// For every Global Variable: +// +// a) We replace its type from T to T*, then this variable is handled +// as T ** value. +// +// b) We insert a load instruction for every user of the variable so +// the user receives a T * value as before. +// +// c) We generate Constructor and Destructor functions that allocate +// memory for the Global Variable via the Hexe memory management +// functions. +// + +// It keeps track of the Old and New Global Variables so +// we can build the constructor and destructor. +typedef std::tuple + CtorDctorT; + + +/// \brief +/// It transforms all the uses of a Constant Expression +/// For every ConstantExpr use it creates an actual instruction +/// that perform the exact operation. It insert a new +/// instruction just before every user of the ConstantExpr. +/// It updates GUsers by inserting those new instructions. +static void handleConstantExpr(ConstantExpr *CE, + std::queue &GUsers) +{ + std::set Users; + for (auto I = CE->use_begin(), IE = CE->use_end(); I!= IE; ++I) + Users.insert(I->getUser()); + + for (auto I = Users.begin(), IE = Users.end(); I!= IE; ++I) { + Instruction *Inst = dyn_cast(*I); + Instruction *CInst = CE->getAsInstruction(); + CInst->insertBefore(Inst); + Inst->replaceUsesOfWith(CE, CInst); + GUsers.push(CInst); + } +} + +/// \brief This function replaces a Global Variable of Type +/// T with one of Type T *. Then for every user of the original +/// variables a new load instruction is inserted so the user +/// uses again a Global Variable of Type T. +static void replaceGlobalVars(GlobalVariable *GV, Module *M, + std::vector &C) +{ + //insert new variable + PointerType *GVT = GV->getType(); + GlobalVariable *NGV = + new GlobalVariable(*M, GVT, GV->isConstant(), + GV->getLinkage(), ConstantPointerNull::get(GVT), + GV->getName()+"_hexe"); + NGV->copyAttributesFrom(GV); + + //handle ConstantExprs and update the Global Variable uses. + std::queue Users; + for (auto I = GV->use_begin(), IE = GV->use_end(); I!= IE; ++I) + Users.push(I->getUser()); + + while (Users.size()) { + Value *User = Users.front(); + Users.pop(); + Instruction *Inst; + + if (isa(User)) { + handleConstantExpr(dyn_cast(User), Users); + continue; + } else + Inst = dyn_cast(User); + + //insert the new load instruction + LoadInst *LI = new LoadInst(NGV, "", Inst); + Inst->replaceUsesOfWith(GV, LI); + } + + //keep track of the New and Old Global Variables + //so we can build the constructor and destructor later. + C.push_back( std::make_tuple(NGV, GV->getType(), GV) ); +} + +/// \brief This function builds a constructor for all the Global Variables +/// we replace. It allocates memory via the Hexe library memory management +/// functions and also initializes that memory based on the content +// of the original Global Variables. 
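+// A hedged before/after sketch of the replacement (names illustrative):
+//
+//   before:  @state      = global [64 x i32] zeroinitializer
+//            ... loads and stores through @state ...
+//
+//   after:   @state_hexe = global [64 x i32]* null
+//            %p = load [64 x i32]*, [64 x i32]** @state_hexe   ; per user
+//            ... the former users of @state now go through %p ...
+//
+// hexe.constructor (registered as a global ctor below) obtains the backing
+// storage from __hexe_malloc, copies the original global's initial data into
+// it with llvm.memcpy and stores the pointer into @state_hexe;
+// hexe.destructor releases the storage through __hexe_free.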
+static void createGlobalConstructor(const std::vector &G, + Module *M, Function *HexeMalloc) +{ + LLVMContext &C = M->getContext(); + const DataLayout &DL = M->getDataLayout(); + Type *Void = Type::getVoidTy(C); + Type *VoidPtr = PointerType::getUnqual( Type::getInt8Ty(C) ); + Type *Int32 = Type::getInt32Ty(C); + Type *Int1 = Type::getInt1Ty(C); + + //get memcpy intrinsic declaration + Type *MemCpyType[] = { VoidPtr, VoidPtr, Int32 }; + Function *Mcopy = + Intrinsic::getDeclaration(M, Intrinsic::memcpy, + ArrayRef(MemCpyType, 3) ); + + //build constructor interface + FunctionType *FT = FunctionType::get(Void, false); + Function *F = + Function::Create(FT, GlobalValue::ExternalLinkage, + "hexe.constructor", M); + F->setCallingConv(CallingConv::C); + + BasicBlock *BB = BasicBlock::Create(C, "", F); + + for (auto I = G.begin(), IE = G.end(); I!= IE; ++I) { + GlobalVariable *GV; + Type *AT; + GlobalValue *In; + std::tie(GV, AT, In) = *I; + + //allocate memory with hexe_malloc + Value *Size = ConstantInt::get(Type::getInt32Ty(C), DL.getTypeAllocSize(AT)); + Value *Al = ConstantInt::get(Type::getInt32Ty(C), 0); + Value *Args[] = { Size, Al }; + CallInst *HMC = CallInst::Create(HexeMalloc, ArrayRef(Args, 2), + "", BB); + + //initialize by copying the original Global Variable content + Instruction *IC = new BitCastInst(In, VoidPtr, "", BB); + Value *Margs[] = { HMC, IC, + ConstantInt::get(Int32, DL.getTypeAllocSize(AT)), + ConstantInt::get(Int32, DL.getABITypeAlignment(AT)), + ConstantInt::get(Int1, 0) + }; + CallInst::Create(Mcopy, ArrayRef(Margs, 5), "", BB); + Instruction *TC = + new BitCastInst(HMC, GV->getType()->getPointerElementType(), "", BB); + new StoreInst(TC, GV, BB); + } + + ReturnInst::Create(C, BB); + + appendToGlobalCtors(*M, F, 65535); +} + +/// \brief This function builds a destructor for all the Global Variables +/// we replace. It releases their memory via the Hexe library memory +/// management functions. +static void createGlobalDestructor(const std::vector &G, + Module *M, Function *HexeFree) +{ + LLVMContext &C = M->getContext(); + Type *Void = Type::getVoidTy(C); + Type *VoidPtr = PointerType::getUnqual(Type::getInt8Ty(C)); + + //build destructor interface + FunctionType *FT = FunctionType::get(Void, false); + Function *F = Function::Create(FT, GlobalValue::ExternalLinkage, + "hexe.destructor", M); + F->setCallingConv(CallingConv::C); + + BasicBlock *BB = BasicBlock::Create(C, "", F); + + for (auto I = G.begin(), IE = G.end(); I!= IE; ++I) { + GlobalVariable *GV; + std::tie(GV, std::ignore, std::ignore) = *I; + + //calls hexe_free + Instruction *LI = new LoadInst(GV, "", BB); + Instruction *TC = CastInst::CreatePointerCast(LI, VoidPtr, "", BB); + CallInst::Create(HexeFree, ArrayRef(TC), "", BB); + } + + ReturnInst::Create(C, BB); + + appendToGlobalDtors(*M, F, 65535); +} + +/// \brief It detects the Basic Blocks of a Function +/// F that contain return instructions. +static void getFunctionReturnBlocks(Function *F, + std::set &RBlocks) +{ + for (auto I = F->begin(), IE = F->end(); I!= IE; ++I) { + if (!isa(I->getTerminator())) + continue; + RBlocks.insert(I); + } +} + +/// \brief +/// It gets the Descendant Basic Blocks of a Block by +/// traversing the CFG. 
The Descendant Blocks are stored +/// in DBlocks +static void getBasicBlockDescendants(BasicBlock *BB, + std::set &DBlocks) +{ + for (auto I : successors(BB)) { + DBlocks.insert(I); + getBasicBlockDescendants(I, DBlocks); + } +} + + +/// \brief This function replaces stack allocations +/// with Hexe memory allocation calls. +/// +/// We replace alloca instruction with hexe_malloc +/// calls. The tricky part here is that by replacing +/// a stack allocation with hexe_malloc we have to +/// explicitly release the memory when it is not in +/// use any more. +/// We get all the Basic Blocks of the function that have +/// a return terminator instruction. We then investigate +/// which of them are reachable from the Basic Block +/// where we replaced the alloca with hexe_malloc. +/// We insert a hexe_free in every return Basic Block +/// that is reachable. +static void replaceAlloca(AllocaInst *AI, Module *M, + Function *HexeMalloc, + Function *HexeFree) +{ + LLVMContext &C = M->getContext(); + const DataLayout &DL = M->getDataLayout(); + Type *Int32 = Type::getInt32Ty(C); + Function *HostF = AI->getParent()->getParent(); + BasicBlock *HostBB = AI->getParent(); + + //a) replace with hexe malloc + Value *Size = + ConstantInt::get(Int32, DL.getTypeAllocSize(AI->getAllocatedType())); + if (!Size->getType()->isIntegerTy(32)) + Size = CastInst::CreateIntegerCast(Size, Int32, false, "", AI); + + // get the allocation size + if (AI->isArrayAllocation()) { + Value *AElems = AI->getArraySize(); + if (!AElems->getType()->isIntegerTy(32)) + AElems = CastInst::CreateIntegerCast(AElems, Int32, false, "", AI); + Size = BinaryOperator::Create(Instruction::Mul, Size, AElems, "", AI); + } + + Value *Al = ConstantInt::get(Type::getInt32Ty(C), 0); + Value *Args[] = { Size, Al }; + CallInst *HMC = + CallInst::Create(HexeMalloc, ArrayRef(Args, 2), "", AI); + + Value *CMC; + if (AI->getType()!= HMC->getType()) + CMC = CastInst::CreatePointerCast(HMC, AI->getType(), "", AI); + else + CMC = HMC; + + //b) replace uses + AI->replaceAllUsesWith(CMC); + AI->eraseFromParent(); + + //c) insert hexe_free calls + //we replace allocas with hexe_mallocs which require + //calling hexe_free to release the allocated memory + + + //We get all the Basic Blocks of the function that have + //a return terminator instruction. We then investigate + //which of them are reachable from the Basic Block + //where we replaced the alloca with hexe_malloc. + //We insert a hexe_free in every return Basic Block + //that is reachable. 
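+// Hedged sketch of the rewrite (value names illustrative):
+//
+//   before:  %buf = alloca [1024 x i8]
+//            ...
+//            ret void
+//
+//   after:   %m   = call i8* @__hexe_malloc(i32 1024, i32 0)
+//            %buf = bitcast i8* %m to [1024 x i8]*
+//            ...
+//            call void @__hexe_free(i8* %m)   ; in every reachable return block
+//            ret void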
+ std::set RBlocks, DBlocks; + getFunctionReturnBlocks(HostF, RBlocks); + getBasicBlockDescendants(HostBB, DBlocks); + + for (auto I = RBlocks.begin(), IE = RBlocks.end(); I!= IE; ++I) { + if (DBlocks.count(*I)) + CallInst::Create(HexeFree, ArrayRef(HMC), + "", (*I)->getTerminator()); + } +} + +bool WorkloadExtractUtil::replaceMemAllocations() +{ + //replace malloc calls + for (auto I = MallocAllocations.begin(), + IE = MallocAllocations.end(); I!= IE; ++I) + replaceMalloc(*I, HexeMalloc, HM); + + //replace global variables + std::vector C; + for (auto I = GlobalAllocations.begin(), + IE = GlobalAllocations.end(); I!= IE; ++I) + replaceGlobalVars(*I, HM, C); + createGlobalConstructor(C, HM, HexeMalloc); + createGlobalDestructor(C, HM, HexeFree); + + //replace allocas + for (auto I = AllocaAllocations.begin(), IE = AllocaAllocations.end(); + I!= IE; ++I) + replaceAlloca(*I, HM, HexeMalloc, HexeFree); + + //replace all free calls + //Due to the limitations of alias analysis and pointer escaping, + //it is unfeasible to track the free call that releases a specific + //malloc allocation. For that reason we replace all the free calls + //with calls to hexe_free function. We then resolve at runtime if + //an allocation was served by the standard malloc or hexe_malloc. + std::vector FreeCalls; + TargetLibraryInfo *TLI = WA->getTLI(); + //function iterator + for (auto I = HM->begin(), IE = HM->end(); I!= IE; ++I) + //BasicBlock iterator + for (auto J = I->begin(), JE = I->end(); J!= JE; ++J) + // Instruction Iterator + for (auto K = J->begin(), KE = J->end(); K!= KE; ++K) + if (isFreeCall(K, TLI)) + FreeCalls.push_back(dyn_cast(K)); + + + for (auto I = FreeCalls.begin(), IE = FreeCalls.end(); I!= IE; ++I) + replaceFree(dyn_cast(*I), HexeFree, HM); + + return true; +} + +/// \brief This utility function either gets or inserts +/// a function declaration to a module. +static void insertOrGetFunctionGValue(StringRef FName, FunctionType *FT, + Module *M, Function * &F ) +{ + F = M->getFunction(FName); + if (!F) { + F = Function::Create(FT, GlobalValue::ExternalLinkage, FName, M); + F->setCallingConv(CallingConv::C); + } +} + + +/// \brief It adds the function declarations of the Hexe runtime +/// interface to the Module. It also defines the struct data types +/// used by these functions. 
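/*
 * Illustrative sketch only -- not part of this patch. A C-level picture of
 * what the rewriting above amounts to for one function. The runtime entry
 * points mirror the __hexe_malloc/__hexe_free prototypes declared in
 * setHexeRTFunctions() below; the stand-in definitions and the function
 * rewritten_example() are made up for the example.
 */
#include <stdlib.h>

static void *__hexe_malloc(unsigned size, unsigned alignment) {
  (void)alignment;              /* stand-in: the real runtime honours it */
  return malloc(size);
}

static void __hexe_free(void *ptr) { free(ptr); }

/* Before the pass, an alloca (a local array) and a malloc feed an offloaded
 * workload, and free() releases the heap block. After the pass, both
 * allocations come from the Hexe runtime, every free() is redirected to
 * __hexe_free, and an extra __hexe_free for the former alloca is inserted
 * before each reachable return (see replaceAlloca above). */
static int rewritten_example(void) {
  int *buf = (int *)__hexe_malloc(64 * sizeof(int), 0); /* was: int buf[64] */
  int *p = (int *)__hexe_malloc(sizeof(int), 0);        /* was: malloc(4)   */
  buf[0] = *p = 0;               /* placeholder for the offloaded workload  */
  __hexe_free(p);                /* was: free(p)                            */
  __hexe_free(buf);              /* inserted before the return              */
  return 0;
}

int main(void) { return rewritten_example(); }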
+
+/// \brief It adds the function declarations of the Hexe runtime
+/// interface to the Module. It also defines the struct data types
+/// used by these functions.
+void WorkloadExtractUtil::setHexeRTFunctions()
+{
+  LLVMContext &C = HM->getContext();
+
+  StructType *EventT = StructType::create(C, "hexe_event_t");
+  PointerType *EventTP = PointerType::getUnqual(EventT);
+  Type *VoidPtr = PointerType::getUnqual(Type::getInt8Ty(C));
+  Type *Int32 = Type::getInt32Ty(C);
+  Type *Void = Type::getVoidTy(C);
+
+  //struct MemAccessInfo {
+  //  void *ptr;
+  //  unsigned size;
+  //  unsigned arg_order;
+  //  char access_mode; // read 0, write 1, readwrite 3
+  //};
+  Type *MemAccessInfoFields[] = {
+    VoidPtr, Int32, Int32,
+    Type::getInt8Ty(C)
+  };
+  StructType *MAST =
+    StructType::get(C, ArrayRef<Type *>(MemAccessInfoFields, 4),
+                    "hexe_mem_access_info_t");
+  Type *MASTP = PointerType::getUnqual(MAST);
+
+  //Hexe Kernel Info Type
+  StructType *KernelInfoT = StructType::create(C, "hexe_kernel_info__t");
+  PointerType *KernelInfoTP = PointerType::getUnqual(KernelInfoT);
+
+  //int __hexe_runtime_sched(unsigned function_id, MemoryAccessInfo *,
+  //                         unsigned num_elems, hexe_kernel_info__t *);
+  Type *HRSArgs[] = { Int32, MASTP, Int32, KernelInfoTP };
+  FunctionType *HRSFT =
+    FunctionType::get(Int32, ArrayRef<Type *>(HRSArgs, 4), false);
+  insertOrGetFunctionGValue("__hexe_runtime_sched", HRSFT,
+                            HM, HexeRuntimeSchedCall);
+
+  //hexe_event_t *__hexe_dispatch(int function_id, void *args, int args_size);
+  Type *HDFArgs[] = { Int32, VoidPtr, Int32 };
+  FunctionType *HDFT =
+    FunctionType::get(EventTP, ArrayRef<Type *>(HDFArgs, 3), false);
+  insertOrGetFunctionGValue("__hexe_dispatch", HDFT, HM, HexeDispatchCall);
+
+  //void __hexe_enforce_coherency(int function_id, MemoryAccessInfo *,
+  //                              unsigned num_elems);
+  Type *HECArgs[] = { Int32, MASTP, Int32 };
+  FunctionType *HECFT =
+    FunctionType::get(Void, ArrayRef<Type *>(HECArgs, 3), false);
+  insertOrGetFunctionGValue("__hexe_enforce_coherency", HECFT,
+                            HM, HexeCoherencyCall);
+
+  //int __hexe_event_wait(hexe_event_t *event);
+  Type *HHWArgs[] = { EventTP };
+  FunctionType *HHWFT =
+    FunctionType::get(Int32, ArrayRef<Type *>(HHWArgs, 1), false);
+  insertOrGetFunctionGValue("__hexe_event_wait", HHWFT,
+                            HM, HexeEventWaitCall);
+
+  //void *__hexe_malloc(unsigned size, unsigned alignment);
+  Type *HMArgs[] = { Int32, Int32 };
+  FunctionType *HMFT =
+    FunctionType::get(VoidPtr, ArrayRef<Type *>(HMArgs, 2), false);
+  insertOrGetFunctionGValue("__hexe_malloc", HMFT, HM, HexeMalloc);
+
+  //void __hexe_free(void *ptr);
+  Type *HFArgs[] = { VoidPtr };
+  FunctionType *HFFT =
+    FunctionType::get(Void, ArrayRef<Type *>(HFArgs, 1), false);
+  insertOrGetFunctionGValue("__hexe_free", HFFT, HM, HexeFree);
+
+  HexeEventT = EventT;
+  MemoryAccessInfoT = MAST;
+  HexeKernelInfoT = KernelInfoT;
+}
+
+char WorkloadExtractor::ID = 1; //just because everyone sets it to 0
+INITIALIZE_PASS_BEGIN(WorkloadExtractor,
+    "hexe-extract", "Hexe Workload Extractor", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominanceFrontier)
+INITIALIZE_PASS_DEPENDENCY(WorkloadAnalysis)
+INITIALIZE_PASS_END(WorkloadExtractor,
+    "hexe-extract", "Hexe Workload Extractor", false, false)
+
+ModulePass *createWorkloadExtractorPass()
+{
+  return new WorkloadExtractor();
+}
+
+WorkloadExtractor::WorkloadExtractor() : ModulePass(ID),
+  WA(nullptr), HW(nullptr), WEU(nullptr)
+{
+  initializeWorkloadExtractorPass(*PassRegistry::getPassRegistry());
+}
+
+void WorkloadExtractor::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.addRequired<LoopInfoWrapperPass>();
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addRequired<DominanceFrontier>();
+  AU.addRequired<WorkloadAnalysis>();
+}
+
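/*
 * Illustrative sketch only -- not part of this patch. The call sequence that
 * the extractor wires around an offloaded region, written out in C so the
 * __hexe_* CHECK lines in the extractor test further down in this patch are
 * easier to follow. The struct mirrors hexe_mem_access_info_t above; the
 * meaning of a non-zero scheduler decision, the void* stand-in for
 * hexe_kernel_info__t, and the helper offload_or_run() are assumptions made
 * for the example.
 */
#include <stddef.h>

typedef struct hexe_event hexe_event_t;

typedef struct {
  void *ptr;
  unsigned size;
  unsigned arg_order;
  char access_mode;   /* read 0, write 1, readwrite 3 */
} hexe_mem_access_info_t;

/* Declarations only; the implementations live in the Hexe runtime library. */
int __hexe_runtime_sched(unsigned function_id, hexe_mem_access_info_t *info,
                         unsigned num_elems, void *kernel_info);
hexe_event_t *__hexe_dispatch(int function_id, void *args, int args_size);
void __hexe_enforce_coherency(int function_id, hexe_mem_access_info_t *info,
                              unsigned num_elems);
int __hexe_event_wait(hexe_event_t *event);

static int offload_or_run(unsigned id, void *args, int args_size,
                          hexe_mem_access_info_t *info, unsigned num_elems,
                          int (*run_on_host)(void *)) {
  if (!__hexe_runtime_sched(id, info, num_elems, NULL))
    return run_on_host(args);                    /* scheduler kept it local */
  __hexe_enforce_coherency(id, info, num_elems); /* make inputs visible     */
  hexe_event_t *ev = __hexe_dispatch(id, args, args_size);
  __hexe_event_wait(ev);                         /* block until completion  */
  __hexe_enforce_coherency(id, info, num_elems); /* make outputs visible    */
  return 0;
}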
+bool WorkloadExtractor::runOnModule(Module &M)
+{
+  LLVMContext &C = M.getContext();
+  WA = &getAnalysis<WorkloadAnalysis>();
+  HW = new HexeWorkload(C);
+
+  WEU = new WorkloadExtractUtil(WA, &M, this, true);
+  WEU->registerHexeWorkload(HW);
+
+  if (HexeFunctionCalls) { //compiler option defined in Hexe.cpp
+    //enable for offloading all the Function Call Workloads
+    auto &WDs = WA->getCallWorkloads();
+    for (auto I = WDs.begin(), IE = WDs.end(); I != IE; ++I)
+      WEU->extractWorkloadCode(I->second);
+  }
+
+  if (HexeLoops) { //compiler option defined in Hexe.cpp
+    //enable for offloading Function Loop Workloads
+    auto &WDs = WA->getLoopWorkloads();
+    while (WDs.size()) {
+      Loop *L;
+      WorkloadInfo *WI;
+      std::tie(L, WI) = *(WDs.begin());
+      WEU->extractWorkloadCode(WI);
+    }
+  }
+
+  //TODO: extend to consider specific Hexe policies
+
+  //replace the memory allocations used by workloads
+  //that have been transformed for offloading
+  WEU->replaceMemAllocations();
+
+  //make sure that the memory intrinsics used
+  //in the Hexe Workload Module have been
+  //declared properly
+  HW->validateSupportedFunctionCalls();
+
+  //set the host DataLayout and Triple
+  //on the Hexe Workload
+  HW->setDLandTriple(M.getDataLayout(),
+                     Triple(M.getTargetTriple()));
+
+  //write the Hexe Workload to a file; the filename is defined by the
+  //HexeWorkloadFName compiler option, defined in Hexe.cpp
+  HW->writeFile(HexeWorkloadFName);
+  return true;
+}
+
+void WorkloadExtractor::releaseMemory()
+{
+  delete HW;
+  delete WEU;
+}
Index: lib/Transforms/Hexe/WorkloadTransform.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Hexe/WorkloadTransform.cpp
@@ -0,0 +1,107 @@
+// ===- WorkloadTransform.cpp - Heterogeneous Execution Engine -*- C++ -*-=== //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// ===--------------------------------------------------------------------=== //
+/// \file
+/// Implementation of the Workload Transform Pass of the Heterogeneous
+/// Execution Engine. Please read the header file documentation for a
+/// high-level description.
+// ===--------------------------------------------------------------------=== //
+
+#include "llvm/Transforms/Hexe/WorkloadTransform.h"
+#include "llvm/Transforms/Hexe/HeterogeneousAdaptors/Adaptors.h"
+#include "llvm/Transforms/Hexe/Hexe.h"
+#include "llvm/Transforms/Hexe/InitializeHexePasses.h"
+#include "llvm/Transforms/Hexe/Utils.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Host.h"
+
+#define DEBUG_TYPE "hwt"
+
+using namespace llvm;
+
+char WorkloadTransform::ID = 1; //just because everyone sets it to 0
+INITIALIZE_PASS_BEGIN(WorkloadTransform, "hexe-transform",
+                      "Hexe Workload Transform", false, false)
+INITIALIZE_PASS_END(WorkloadTransform, "hexe-transform",
+                    "Hexe Workload Transform", false, false)
+
+ModulePass *createWorkloadTransformPass()
+{
+  return new WorkloadTransform();
+}
+
+WorkloadTransform::WorkloadTransform() : ModulePass(ID)
+{
+  initializeWorkloadTransformPass(*PassRegistry::getPassRegistry());
+}
+
+void WorkloadTransform::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  //empty for now....
+}
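// Illustrative sketch only -- not part of this patch. runOnModule() below
// obtains a HexeWorkloadAdaptor through a StringSwitch and then calls
// isSupported() and transform() on it. A minimal, do-nothing adaptor would
// have the following shape; the class name and the triple check are made up
// for the example, and a real adaptor such as the Hexagon one rewrites the
// workload module to the accelerator's conventions instead.
namespace {
class NullWorkloadAdaptor : public HexeWorkloadAdaptor {
public:
  void transform(Module *M, const Triple &HostTriple,
                 const Triple &AccelTriple,
                 const HexeFunctionInfoListT &FunctionList) override {
    // Deliberately a no-op: the module is left untouched.
  }

  bool isSupported(const Triple &HostTriple,
                   const Triple &AccelTriple) override {
    // Example policy: only allow pairs with matching pointer width.
    return HostTriple.isArch32Bit() == AccelTriple.isArch32Bit();
  }
};
} // end anonymous namespace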
+
+bool WorkloadTransform::runOnModule(Module &M)
+{
+  LLVMContext &C = M.getContext();
+
+  //get the Accelerator Target Triple
+  std::string AccelTripleStr = sys::getDefaultTargetTriple();
+  Triple AccelTriple(AccelTripleStr);
+
+  Triple HostTriple; //the Host Triple is read from the Hexe Metadata
+
+  //read the Hexe Metadata
+  readHexeMetadata(&M, functionNum, FL, HostTriple);
+
+  //create an instance of the requested Adaptor;
+  //HexeAdaptor is a compiler flag defined in Hexe.cpp
+  HexeWorkloadAdaptor *HWT =
+    StringSwitch<HexeWorkloadAdaptor *>(HexeAdaptor)
+      .Case("hexagon", createHexagonWorkloadAdaptor())
+      .Default(nullptr);
+
+  if (!HWT)
+    C.emitError("The Accelerator Adaptor is not supported.");
+
+  //check if the Adaptor supports IR transformations
+  //for the specific combination of Host and Accelerator Triples.
+  //This check can be skipped by setting the HexeAdaptorCheck flag
+  //to false.
+  if (HexeAdaptorCheck && !HWT->isSupported(HostTriple, AccelTriple)) {
+    std::string Msg = "Host: ";
+    Msg += HostTriple.str();
+    Msg += " Accelerator: ";
+    Msg += AccelTriple.str();
+    Msg += " not supported.";
+    C.emitError(Msg);
+  }
+
+  //perform the transformations
+  HWT->transform(&M, HostTriple, AccelTriple, FL);
+
+  //delete the adaptor instance
+  delete HWT;
+
+  //remove Hexe related Metadata from the Module
+  eraseHexeMetadata(&M);
+
+  return true;
+}
+
+void WorkloadTransform::releaseMemory()
+{
+  FL.clear();
+}
+
Index: test/Transforms/Hexe/analysistest1.ll
===================================================================
--- /dev/null
+++ test/Transforms/Hexe/analysistest1.ll
@@ -0,0 +1,599 @@
+; RUN: opt -S -hexe-analysis -analyze < %s | FileCheck %s
+
+; Code and Memory Reference Checks, Functions
+
+; CHECK: add:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: add_g:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs: G
+
+; CHECK: add_p:
+; CHECK-NEXT: InterfaceMemRefs: x y
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: add_pg:
+; CHECK-NEXT: InterfaceMemRefs: x
+; CHECK-NEXT: GlobalMemRefs: G
+
+; CHECK: comp:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: comp_g:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs: G
+
+; CHECK: comp_p:
+; CHECK-NEXT: InterfaceMemRefs: x y
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: comp_pg:
+; CHECK-NEXT: InterfaceMemRefs: x
+; CHECK-NEXT: GlobalMemRefs: G
+
+
+; Code and Memory Reference Checks, Loops
+
+; CHECK: for.body, Parent Function: comp:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: for.body, Parent Function: comp_g:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: for.body, Parent Function: comp_p:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: for.body, Parent Function: comp_pg:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+
+; Memory Allocation Use Checks, Function Calls
+
+; CHECK: call, Callee: add Caller: call_point:
+; CHECK-NEXT: GlobalAllocations:
+; CHECK-NEXT: MallocAllocations:
+; CHECK-NEXT: AllocaAllocations:
+
+; CHECK: call, Callee: add Caller: call_point2:
+; CHECK-NEXT: GlobalAllocations:
+; CHECK-NEXT: MallocAllocations:
+; CHECK-NEXT: AllocaAllocations:
+
+; CHECK: call1, Callee: comp Caller: call_point:
+; CHECK-NEXT: GlobalAllocations:
+; CHECK-NEXT: MallocAllocations:
+; CHECK-NEXT: AllocaAllocations:
+
+; CHECK: call1, Callee: comp Caller: call_point2:
+; CHECK-NEXT: GlobalAllocations:
+; CHECK-NEXT: MallocAllocations:
+; CHECK-NEXT: AllocaAllocations:
+
+; CHECK:
call10, Callee: comp_p Caller: call_point: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: xb yb + +; CHECK: call12, Callee: comp_pg Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: xb + +; CHECK: call4, Callee: add_p Caller: call_point: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: call2 call3 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call4, Callee: add_p Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: call2 call3 +; CHECK-NEXT:AllocaAllocations: + +; CHECK: call5, Callee: add_g Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call5, Callee: add_g Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call6, Callee: add_pg Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: call2 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call6, Callee: add_pg Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: call2 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call7, Callee: comp_p Caller: call_point: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: call2 call3 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call7, Callee: comp_p Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: call2 call3 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call8, Callee: comp_g Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call8, Callee: comp_g Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call9, Callee: comp_pg Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: call2 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call9, Callee: comp_pg Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: call2 +; CHECK-NEXT: AllocaAllocations: + + +; Memory Allocation Use Checks, Loops + +; CHECK: for.body, Parent Function: comp: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: for.body, Parent Function: comp_g: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: for.body, Parent Function: comp_p: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: for.body, Parent Function: comp_pg: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + + +;#include +;#include +; +;//scalar arguments +;int add(int x, int y) +;{ +; return x+y; +;} +; +;//sclar arguments, loop +;int comp(int x, int y, int lstep) +;{ +; int value=0; +; int i; +; +; for(i=0; i +; +;const unsigned N=100; +;char global[N]; +;int globali[N]; +; +;void funny_memcpy(char *dst, char *src, size_t N) +;{ +; int i; +; +; for(i=0; i* null, i32 0, %hexe_kernel_info__t* null) +; CHECK: %hexe_sched_decision_cast = trunc i32 %hexe_sched_decision to i1 +; CHECK: call void @__hexe_enforce_coherency(i32 0, <{ i8*, i32, i32, i8 }>* %5, i32 1) +; CHECK: %hexe_offload = call %hexe_event_t* @__hexe_dispatch(i32 0, i8* 
%16, i32 16) +; CHECK: %hexe_wait = call i32 @__hexe_event_wait(%hexe_event_t* %hexe_offload) +; CHECK: call void @__hexe_enforce_coherency(i32 0, <{ i8*, i32, i32, i8 }>* %5, i32 1) + + +; check control flow changes and runtime calls + +; CHECK: %hexe_sched_decision1 = call i32 @__hexe_runtime_sched(i32 1, <{ i8*, i32, i32, i8 }>* null, i32 0, %hexe_kernel_info__t* null) +; CHECK: %hexe_offload4 = call %hexe_event_t* @__hexe_dispatch(i32 1, i8* %60, i32 28) +; CHECK: call void @__hexe_enforce_coherency(i32 1, <{ i8*, i32, i32, i8 }>* %29, i32 4) +; CHECK: %hexe_offload10 = call %hexe_event_t* @__hexe_dispatch(i32 1, i8* %98, i32 28) + +; CHECK: define void @hexe.constructor() { +; CHECK: define void @hexe.destructor() { + + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "hexeextracttest--linux-gnueabi" + +@G = global float 2.000000e+00, align 4 +@G2 = global i32 10, align 4 + +; Function Attrs: noinline nounwind readonly +define i32 @add(i32 %x, i32 %y) #0 { +entry: + %add = add nsw i32 %y, %x + %0 = load i32, i32* @G2, align 4, !tbaa !3 + %add1 = add nsw i32 %add, %0 + ret i32 %add1 +} + +; Function Attrs: noinline nounwind readonly +define i32 @addpf(i32 %x, i32 %y, i32* nocapture readonly %i, i32* nocapture readonly %j) #0 { +entry: + %add = add nsw i32 %y, %x + %0 = load float, float* @G, align 4, !tbaa !7 + %conv = fptosi float %0 to i32 + %add1 = add nsw i32 %add, %conv + %1 = load i32, i32* %i, align 4, !tbaa !3 + %add2 = add nsw i32 %add1, %1 + %2 = load i32, i32* %j, align 4, !tbaa !3 + %add3 = add nsw i32 %add2, %2 + %3 = load i32, i32* @G2, align 4, !tbaa !3 + %add4 = add nsw i32 %add3, %3 + ret i32 %add4 +} + +; Function Attrs: noinline nounwind +define i32 @call_point() #1 { +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %call = tail call i32 @add(i32 10, i32 20) + store i32 40, i32* %a, align 4, !tbaa !3 + store i32 50, i32* %b, align 4, !tbaa !3 + %call1 = tail call noalias i8* @malloc(i32 4) #3 + %0 = bitcast i8* %call1 to i32* + %call2 = tail call noalias i8* @malloc(i32 4) #3 + %1 = bitcast i8* %call2 to i32* + store i32 100, i32* %0, align 4, !tbaa !3 + store i32 110, i32* %1, align 4, !tbaa !3 + %2 = load float, float* @G, align 4, !tbaa !7 + %conv = fptosi float %2 to i32 + %call3 = call i32 @addpf(i32 %conv, i32 100, i32* %a, i32* %b) + %call4 = tail call i32 @addpf(i32 10, i32 100, i32* %0, i32* %1) + tail call void @free(i8* %call1) #3 + tail call void @free(i8* %call2) #3 + %add = add nsw i32 %call3, %call + %add5 = add nsw i32 %add, %call4 + ret i32 %add5 +} + +; Function Attrs: nounwind +declare noalias i8* @malloc(i32) #2 + +; Function Attrs: nounwind +declare void @free(i8* nocapture) #2 + +attributes #0 = { noinline nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", 
i32 4} +!1 = !{i32 1, !"min_enum_size", i32 4} +!2 = !{!"Snapdragon LLVM ARM Compiler 3.5 (based on LLVM 3.7.0)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!8, !8, i64 0} +!8 = !{!"float", !5, i64 0} Index: test/Transforms/Hexe/transformtest1.ll =================================================================== --- /dev/null +++ test/Transforms/Hexe/transformtest1.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -hexe-transform -hexe-adaptor=hexagon -hexe-adaptor-check=false < %s | FileCheck %s + +;check function types + +;CHECK: type { i8*, i32 } +;CHECK: type { i32, i32, i32*, i32 } +;CHECK: type { i32, i32, i32*, i32*, float*, i32*, i32 } + +;check generated function interface + +;CHECK: define internal i32 @add_compact(%0*) { +;CHECK-NEXT: marshallingblock: + +;CHECK: define internal i32 @addpf_compact(%0*) { +;CHECK-NEXT: marshallingblock: + +; check skel_invoke +;CHECK: define i32 @__hexe_skel_invoke(i32, %0*) { +;CHECK: switch i32 %4, label %5 [ +;CHECK: i32 0, label %6 +;CHECK: i32 1, label %8 + +;CHECK: %7 = call i32 @add_compact(%0* %1) +;CHECK: %9 = call i32 @addpf_compact(%0* %1) + + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "hexe-unknown--unknown" + +; Function Attrs: noinline nounwind readonly +define available_externally i32 @add(i32, i32, i32*) #0 { +entry: + %add = add nsw i32 %1, %0 + %3 = load i32, i32* %2, align 4, !tbaa !4 + %add1 = add nsw i32 %add, %3 + ret i32 %add1 +} + +; Function Attrs: noinline nounwind readonly +define available_externally i32 @addpf(i32, i32, i32* nocapture readonly, i32* nocapture readonly, float*, i32*) #0 { +entry: + %add = add nsw i32 %1, %0 + %6 = load float, float* %4, align 4, !tbaa !8 + %conv = fptosi float %6 to i32 + %add1 = add nsw i32 %add, %conv + %7 = load i32, i32* %2, align 4, !tbaa !4 + %add2 = add nsw i32 %add1, %7 + %8 = load i32, i32* %3, align 4, !tbaa !4 + %add3 = add nsw i32 %add2, %8 + %9 = load i32, i32* %5, align 4, !tbaa !4 + %add4 = add nsw i32 %add3, %9 + ret i32 %add4 +} + +attributes #0 = { noinline nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!hexe.info = !{!0} +!hexe.host_triple = !{!1} +!hexe.function_list = !{!2, !3} + +!0 = !{i32 2} +!1 = !{!"hexeextracttest--linux-gnueabi"} +!2 = !{i32 (i32, i32, i32*)* @add, i32 0} +!3 = !{i32 (i32, i32, i32*, i32*, float*, i32*)* @addpf, i32 1} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9, !9, i64 0} +!9 = !{!"float", !6, i64 0} Index: tools/opt/CMakeLists.txt =================================================================== --- tools/opt/CMakeLists.txt +++ tools/opt/CMakeLists.txt @@ -15,6 +15,7 @@ Support Target TransformUtils + Hexe Vectorize Passes ) Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -317,6 +317,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeHexe(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. initializeCodeGenPreparePass(Registry);
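For reference, a minimal sketch of how a standalone tool could register and run the Hexe passes outside of opt, mirroring the opt.cpp hunk above. This is not part of the patch; the input filename, the pass selection, and the initialization calls shown are assumptions for the example, and the tool must link against the Hexe library added by this change.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Hexe/Hexe.h"
#include <memory>

using namespace llvm;

int main() {
  LLVMContext Context;
  SMDiagnostic Err;
  // "input.ll" is a placeholder for a host module to analyze and extract.
  std::unique_ptr<Module> M = parseIRFile("input.ll", Err, Context);
  if (!M)
    return 1;

  // Register the passes, as opt.cpp does after this patch.
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeCore(Registry);
  initializeAnalysis(Registry);
  initializeHexe(Registry);

  // Run the Hexe analysis and extraction over the module.
  legacy::PassManager PM;
  PM.add(createWorkloadAnalysisPass());
  PM.add(createWorkloadExtractorPass());
  PM.run(*M);
  return 0;
}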