Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -249,7 +249,7 @@ void initializeScalarEvolutionPass(PassRegistry&); void initializeShrinkWrapPass(PassRegistry &); void initializeSimpleInlinerPass(PassRegistry&); -void initializeShadowStackGCLoweringPass(PassRegistry&); +void initializeShadowStackGCLoweringPass(PassRegistry&); void initializeRegisterCoalescerPass(PassRegistry&); void initializeSingleLoopExtractorPass(PassRegistry&); void initializeSinkingPass(PassRegistry&); @@ -300,6 +300,10 @@ void initializeDwarfEHPreparePass(PassRegistry&); void initializeFloat2IntPass(PassRegistry&); void initializeLoopDistributePass(PassRegistry&); +void initializeHexe(PassRegistry &); +void initializeWorkloadAnalysisPass(PassRegistry &); +void initializeWorkloadExtractorPass(PassRegistry &); +void initializeWorkloadTransformPass(PassRegistry &); } #endif Index: include/llvm/Transforms/Hexe/HeterogeneousAdaptors/Adaptors.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/HeterogeneousAdaptors/Adaptors.h @@ -0,0 +1,58 @@ +//===--- Transforms/Hexe/HeterogeneousAdaptors/Adaptor.h - Hexe --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// ===------- High-Level Description of the Hexe Adaptor Interface -------=== +/// This file provides the Hexe Workload Adaptor interface. Implementations of +/// this interface are responsible for transforming a Hexe Workload to a +/// particular set of conventions required by an accelerator and its programming +/// environment. This file also includes function declarations for the +/// initialization of the available plugins. +//===---------------------------------------------------------------------===// + + +#ifndef LLVM_TRANSFORMS_HEXE_HETEROGENEOUSADAPTORS_H +#define LLVM_TRANSFORMS_HEXE_HETEROGENEOUSADAPTORS_H + +#include +#include + +namespace llvm { + +class Function; +class Module; +class Triple; + +typedef std::tuple HexeFunctionInfoT; +typedef std::vector HexeFunctionInfoListT; + +class HexeWorkloadAdaptor { +public: + HexeWorkloadAdaptor() {}; + virtual ~HexeWorkloadAdaptor() {}; + + /// \brief It performs the required transformations. + /// After the completion of this, the Module should comply + /// to the conventions of the Adaptor. + virtual void transform(Module *Module, + const Triple &HostTriple, const Triple &AccelTriple, + const HexeFunctionInfoListT &FunctionList) = 0; + + /// It checks if a particular combination of Host and Accelerator Triples + /// is supported by the adaptor. + virtual bool isSupported(const Triple &HostTriple, + const Triple &AccelTriple) = 0; +}; + + +/// \brief It creates and returns an instrance of the Hexagon Adaptor. +HexeWorkloadAdaptor *createHexagonWorkloadAdaptor(); + +} + +#endif Index: include/llvm/Transforms/Hexe/Hexe.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/Hexe.h @@ -0,0 +1,64 @@ +//===-- Transforms/Hexe/Hexe.h - Heterogeneous Execution Engine -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file +/// This header file provides function prototypes for the instantiation of Hexe +/// Passes. It also declares the command line flags that control the Hexe +/// operations. +//===---------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_HEXE_HEXE_H +#define LLVM_TRANSFORMS_HEXE_HEXE_H + +#include "llvm/Support/CommandLine.h" +#include + +namespace llvm { + +class ModulePass; + + +//===------------------------------------------------------------------===// +// +// Workload Analysis Pass - This pass analyzes workloads for offloading to +// DSPs/GPUs +// + +ModulePass *createWorkloadAnalysisPass(); + + + +//===------------------------------------------------------------------===// +// +// Workload Extractor Pass - This pass extracts workloads for offloading to +// DSPs/GPUs +// + +ModulePass *createWorkloadExtractorPass(); + + +//===------------------------------------------------------------------===// +// +// Workload Extractor Pass - This pass extracts workloads for offloading to +// DSPs/GPUs +// + +FunctionPass *createWorkloadTransformPass(); + + +//===------------------------------------------------------------------===// +//===------------------------------------------------------------------===// +//HEXE FLAGS +extern cl::opt HexeFunctionCalls; +extern cl::opt HexeLoops; +extern cl::opt HexePolicy; +extern cl::opt HexeWorkloadFName; +extern cl::opt HexeAdaptor; +extern cl::opt HexeAdaptorCheck; +} + +#endif //LLVM_TRANSFORMS_HEXE_HEXE_H Index: include/llvm/Transforms/Hexe/InitializeHexePasses.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/InitializeHexePasses.h @@ -0,0 +1,28 @@ +//===------- Transforms/Hexe/InitializeHexePasses.h - Hexe -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This header file provides function prototypes for the initialization of +/// the Heterogeneous Execution Engine Passes. +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HEXE_INITIALIZEHEXEPASSES_H +#define LLVM_TRANSFORMS_HEXE_INITIALIZEHEXEPASSES_H + +namespace llvm { +class PassRegistry; + +/// \file initializeHexe - Initialize all passes linked into +/// Hexe library. +void initializeHexe(PassRegistry &); +void initializeWorkloadAnalysisPass(PassRegistry &); +void initializeWorkloadExtractorPass(PassRegistry &); +void initializeWorkloadTransformPass(PassRegistry &); +} + +#endif //LLVM_TRANSFORMS_HEXE_INITIALIZEHEXEPASSES_H Index: include/llvm/Transforms/Hexe/Utils.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/Utils.h @@ -0,0 +1,68 @@ +//===-- Transforms/Hexe/Utils.h - Heterogeneous Execution Engine -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This header file provides prototypes for utility functions that +/// a)create Struct Types based on function interfaces. 
+/// b)Hexe Metadata read and write functions. +/// c)erase Hexe Metadata from a Module +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HEXE_UTIL_H +#define LLVM_TRANSFORMS_HEXE_UTIL_H + +#include +#include +#include + +namespace llvm { + +class Function; +class FunctionType; +class LLVMContext; +class Module; +class StructType; +class Triple; + +typedef std::tuple HexeFunctionInfoT; +typedef std::tuple< Function *, unsigned, StructType *> HexeFunctionStructInfoT; +typedef std::vector HexeFunctionInfoListT; +typedef std::map FunctionHexeFunctionMapT; + +/// \brief getCompactFunctionStruct builds a Struct Type based on the +/// function interface of a function. The interface is given as the +/// Function Type FT. +// +/// \returns the Struct Type. +StructType *getCompactFunctionStruct(FunctionType *FT, LLVMContext &C); + +/// \brief readHexeMetadata reads the Hexe metadata from a Module. +/// +/// \param NumHexeFunctions: the number of Hexe functions included +/// in the module. +/// \param FunctionList: the list of functions. +/// \param HostTriple: the triple of the host platform. +void readHexeMetadata(Module *M, unsigned &NumHexeFunctions, + HexeFunctionInfoListT &FunctionList, Triple &HostTriple); + +/// \brief writeHexeMetadata writes the Hexe metadata to a Module. +/// +/// \param NumHexeFunctions: the number of Hexe functions included +/// in the module. +/// \param FunctionMap: the function map. +/// \param HostTriple: the triple of the host platform. +void writeHexeMetadata(Module *M, unsigned NumHexeFunctions, + const FunctionHexeFunctionMapT &FunctionMap, + const Triple &HostTriple); + +/// \brief erase Hexe Metadata from Module +void eraseHexeMetadata(Module *M); + +} + +#endif //LLVM_TRANSFORMS_HEXE_UTIL_H Index: include/llvm/Transforms/Hexe/WorkloadAnalysis.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/WorkloadAnalysis.h @@ -0,0 +1,380 @@ +//===---------- Transforms/Hexe/WorkloadAnalysis.h - Hexe -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// ===----------- High-Level Description of the Workload Analysis ----------=== +/// Key Goal: Design and implement an analysis pass that can reason if a loop +/// or a function call is eligible for offloading to an accelerator. If a loop +/// or function call is considered eligible for offloading, the analysis +/// generates a Workload Information handler that can be used by the offloading +/// transformations. +/// +/// Key points: +/// Architecture constrains: +/// a) Memory Coherency +/// The main processor and the accelerator may share a joint or +/// disjoint memory with full, limited or no coherency support. +/// In case of systems where coherency is supported, it is also +/// proven that explicit communication and synchronization leads +/// to higher performance. It is important to be able to reason +/// about the memory a loop or function call accesses. Our approach +/// follows the assumption that the target system does not have +/// hardware support for coherency and runtime/driver operations +/// are required. However this can be relaxed to "take advantage" +/// of coherent systems. 
+/// b) Data Layout +/// A function or loop may access complex data types containing +/// scalars and pointers referring to arbitrary addresses in memory. +/// We constrain our analysis criteria to consider eligible for +/// offloading only loops and functions that access data types +/// which do not have nested pointer types. Any combination of +/// scalars, structs and arrays is supported. Again, this could +/// be relaxed for coherent systems. +/// c) Atomic, Specialized Instruction/Operations +/// We do not consider as eligible for offloading code that +/// contains atomics, and specialized instructions expressed +/// as intrinsics. The reason is that the main processor and +/// accelerator architectures may vary significantly in their +/// capabilities. Again, analysis specialization for particular +/// systems could be considered. +/// +/// +/// Hexe Approach: +/// The analysis operates in two steps: (1) Code and Memory Reference +/// Analysis for Functions and Loops and (2) Memory Allocation Use +/// Analysis. +/// +/// 1) Code and Memory Reference Analysis for Functions and Loops: +/// A) We analyze all the instructions of the Function and Loop +/// codes to decide if we support them for offloading. +/// +/// B) We then analyze the memory references performed by the +/// same code by analyzing the load and store instructions. +/// +/// a) In case of Functions: +/// The load and store instructions should either refer +/// to memory segments available as Global Variables or +/// memory segments provided by the function interface. +/// +/// b) In case of Loops: +/// The load and store instructions should either refer +/// to memory segments available as Global Variables or +/// to memory segments provided by their parent function +/// interface. +/// +/// 2) Memory Allocation Use Analysis +/// At this point we proceed our analysis only on functions and loops +/// that successfully passed the first step. The idea here is to +/// map the memory references we detected in the previous +/// step to actual Memory Allocations. We consider three type of +/// Allocations so far: a) Global Variables, b) Dynamic Allocations +/// (malloc) and c) Stack Allocations (alloca). +/// +/// Both Functions and Loops have memory references that either +/// are Global Variables or they are provided by the Function Interface. +/// +/// A) Global Variable References. +/// We already know that they are Global Variables and we know +/// their allocation, so there is nothing to be done here. +/// +/// B) Function Interface References (The interesting case). +/// The origin of the memory references is determined at the +/// call sites of the function. Each time a function is called +/// those references may be mapped to any type of Memory +/// Allocation and we need to analyze them. That is the reason +/// we consider Function Calls for offloading and not just +/// Function definitions. +/// +/// IMPORTANT NOTE: +/// A Loop is considered eligible for offloading only if all the +/// calls of its parent function can be successfully analyzed. +/// We apply this limitation because we want to avoid the need +/// for generating multiple code versions for the function +/// that hosts the loop. Generating multiple versions would be +/// the case if we have a situation where some call sites of the +/// parent function can be analyzed and some others not. +/// +/// +/// FUTURE PLANS (TODO): +/// A)Extend the analysis API to support user given information +/// about eligible code for offloading. 
This can be useful in the +/// following cases: +/// a) Compiling code with special pragmas or attributes. +/// b) Compiling Functional and Domain Specific Languages. +/// c) Exploit runtime information in case of combining +/// Hexe with MCJIT. +/// B)Enable a relaxed analysis mode for fully coherent systems +//===-----------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_HEXE_WORKLOADANALYSIS_H +#define LLVM_TRANSFORMS_HEXE_WORKLOADANALYSIS_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Pass.h" +#include +#include + +namespace llvm { + +class Function; +class Loop; +class Value; +class Type; +class LoopInfo; +class DominatorTree; +class CallGraph; +class TargetLibraryInfo; +class WorkloadAnalysis; + +/// \brief This enum class defines the available Workload Types +/// that are supported by the Workload Analysis. +/// +/// Available Workload Types: +/// Function, +/// Call, a Function Call +/// Loop +/// There are two variations of the above: +/// NoGlobals: the workload does not access global variables +/// WithGlobals: the workload accesses global variables +enum class WorkloadType { + FunctionNoGlobals, + FunctionWithGlobals, + CallNoGlobals, + CallWithGlobals, + LoopNoGlobals, + LoopWithGlobals +}; + +/// \brief It represents a Memory Reference +/// +/// This is a typedef of the AliasAnalysis::Location +typedef AliasAnalysis::Location WorkloadMemRef; + + +/// \brief This Comparator orders Workload Memory References +/// by their pointer Value. +struct WorkloadMemRefComparator { + bool operator() (const WorkloadMemRef &a, const WorkloadMemRef &b) const + { return a.Ptr WorkloadMemRefSet; + +/// \brief Objects of this class represent Workloads that +/// we consider for offloading. Those workloads can either +/// be Function Calls and Loops. +/// +/// Workloads of Function type are also supported but they +/// are exclusively used by the analysis internals. +/// +/// The reason we use this class design instead of using the +/// llvm virtual class style is that a Workload object gets +/// transformed across the different stages of the Workload +/// Analysis and it is more efficient to be able to mutate +/// the same object. +class WorkloadInfo { +public: + WorkloadInfo() {} + + /// \returns true if the Workload is a Function. + bool isFunction() const { + return ((type == WorkloadType::FunctionNoGlobals) || + (type == WorkloadType::FunctionWithGlobals)); + } + + /// \returns true if the Workload is a Function Call. + bool isCall() const { + return ((type == WorkloadType::CallNoGlobals) || + (type == WorkloadType::CallWithGlobals)); + } + + /// \returns true if the Workload is a Loop. 
+ bool isLoop() const { + return ((type == WorkloadType::LoopNoGlobals) || + (type == WorkloadType::LoopWithGlobals)); + } + + /// \returns the function considered for offloading + Function *getFunction() const { return F; } + + /// \returns the loop considered for offloading + Loop *getLoop() const { return L; } + + /// \returns the Function Call + CallInst *getCall() const { return CI; } + + /// \returns the Interface Memory References + WorkloadMemRefSet &getInterfaceMemRefs() { return InterfaceMemRefs; } + + /// \returns the Global Variable Memory References + WorkloadMemRefSet &getGlobalMemRefs() { return GlobalMemRefs; } + + /// \returns the Global Variable Allocations accessed + /// by the workload + SetVector &getGlobalAllocations() { + return GlobalAllocations; + } + + /// \returns the Heap Memory Allocations accessed + /// by the Workload + SetVector &getMallocAllocations() { return MallocAllocations; } + + /// \returns the Stack Memory Allocations accessed + /// by the Workload + SetVector &getAllocaAllocations() { return AllocaAllocations; } + + private: + WorkloadType type; //The Workload Type + Function *F; //The function to get offloaded. + + union{ + CallInst *CI; //The Caller of the Function to get offloaded. + Loop *L; //The Loop to get extracted. + }; + + //Memory References + WorkloadMemRefSet InterfaceMemRefs; //passed by the function interface + WorkloadMemRefSet GlobalMemRefs; //global references (global vars) + + //Memory Allocations + SetVector GlobalAllocations; //Global Variables + SetVector MallocAllocations; //Heap Allocations + SetVector AllocaAllocations; //Stack Allocations + friend class WorkloadAnalysis; +}; + +typedef std::map CallWorkloadMapT; +typedef std::map LoopWorkloadMapT; +typedef std::map FunctionWorkloadT; +typedef std::set> FunctionCallersSet; + +/// \brief This Pass provides the Workload Analysis for the Heterogeneous +/// Execution Engine. It provides a high level interface. The user can +/// query the offloading eligibility of a Function Call or Loop and retrieve +/// a WorkloadInfo handler that can be used for the offloading transformations. +class WorkloadAnalysis : public ModulePass { +public: + WorkloadAnalysis(); + ~WorkloadAnalysis() {}; + static char ID; + /// \brief It requests analyses etc + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M) override; + + /// It releases the memory used by the analysis + void releaseMemory() override; + + /// \brief This function prints the analyses results. + /// It provides the results for both analysis steps, + /// (a) Code and Memory Analysis and (b) Memory + /// Allocation Use Analysis. + /// + ///We report: + /// Code and Memory Reference Eligibility: + /// Function Workloads + /// Loop Workloads + /// Memory Allocation Use Eligibility: + /// Function Call Workloads (Functions as part + /// of a particular Function Call context) + /// Loops Workloads) + void print(raw_ostream &O, const Module *M) const override; + + /// \brief It checks if a particular Function call can be offloaded. + /// \returns true if the Function is eligible for offloading. + bool isEligibleForOffloading(const CallInst *CI) { + return getOffloadingHandler(CI); + } + + /// \brief It checks if a particular Loop can be offloaded. + /// \returns true if the Loop is eligible for offloading. 
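+  ///
+  /// A minimal usage sketch (illustrative only; assumes a client pass that
+  /// has requested WorkloadAnalysis in its getAnalysisUsage and holds a
+  /// LoopInfo pointer LI):
+  /// \code
+  ///   WorkloadAnalysis &WA = getAnalysis<WorkloadAnalysis>();
+  ///   for (Loop *L : *LI)
+  ///     if (WA.isEligibleForOffloading(L)) {
+  ///       WorkloadInfo *WI = WA.getOffloadingHandler(L);
+  ///       // ... hand WI to the offloading transformation ...
+  ///     }
+  /// \endcode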
+ bool isEligibleForOffloading(const Loop *L) { + return getOffloadingHandler(L); + } + + /// \returns a WorkloadInfo handler if the Loop is + /// eligible for offloading, otherwise nullptr + WorkloadInfo *getOffloadingHandler(const CallInst *CI) { + auto WI=CallWorkloadMap.find( const_cast(CI) ); + return ( WI !=CallWorkloadMap.end() ) ? WI->second : nullptr; + } + + /// \returns a WorkloadInfo handler if the Loop is + /// eligible for offloading, otherwise nullptr + WorkloadInfo *getOffloadingHandler(const Loop *L) { + auto WI=LoopWorkloadMap.find( const_cast(L) ); + return ( WI !=LoopWorkloadMap.end() ) ? WI->second : nullptr; + } + + /// SetForOffloading notifies the analysis state that a particular + /// Workload is transformed for Offloading. If that Workload + /// is a function it also invalidates the eligibility of all its + /// nested loops. If the Workload is a Loop it invalidates the + /// eligibility of its parent function, its nested sub-loops and + /// all the loops that have a level value lower than the Loop. + bool setForOffloading(WorkloadInfo *WI); + + /// Transforming a Loop for offloading requires the extraction of the Loop + /// to a function before proceeding with the transformations. Calling this + /// member function updates the WorkloadInfo handler information after the + /// loop extraction. + bool MutateLoopToCallWorkload(WorkloadInfo *WI, Function *F, CallInst *CI); + + /// \returns all the Function Calls that are eligible for + /// offloading + CallWorkloadMapT & getCallWorkloads() { return CallWorkloadMap; } + + /// \returns all the Loops that are eligible for offloading + LoopWorkloadMapT & getLoopWorkloads() { return LoopWorkloadMap; } + + /// \returns a TargetLibraryInfo instance + TargetLibraryInfo *getTLI() { return TLI; } + +private: + // Documentation for the private member functions is available + // in the implementation file. 
+ + void analyzeCodeMemRef(Function &F, FunctionWorkloadT &FunctionWorkloadMap, + LoopWorkloadMapT &EligibleLoopWorkloadMap); + bool analyzeCodeMemRefEligibility(Function *F, + FunctionWorkloadT &FunctionWorkloadMap); + bool analyzeCodeMemRefEligibility(Loop *L, + LoopWorkloadMapT &EligibleLoopWorkloadMap); + + void analyzeMemAllocUseEligibility(Function *F, WorkloadInfo *WI, + FunctionCallersSet &Callers); + void analyzeMemAllocUseEligibility(Loop *L, WorkloadInfo *WI, + FunctionCallersSet &Callers); + + // Workload of Function and Loops that passed the first step of + // the Analysis (Code and Memory Reference Analysis); + FunctionWorkloadT EligibleFunctionWorkloadMap; + LoopWorkloadMapT EligibleLoopWorkloadMap; + + // Workload of Function Calls and Loops that passed the second + // step of the Analysis (Memory Allocation Use Analysis); + LoopWorkloadMapT LoopWorkloadMap; + CallWorkloadMapT CallWorkloadMap; + + // Support Data Structures + std::map> ELoops; + std::map> ECalls; + std::vector GarbageCollector; + + // Analyses used by the Workload Analysis + AliasAnalysis *AA; + LoopInfo *LI; + DominatorTree *DT; + CallGraph *CG; + TargetLibraryInfo *TLI; + std::vector LoopInfoCache; +}; + +} + +#endif //LLVM_TRANSFORMS_HEXE_WORKLOADANALYSIS_H Index: include/llvm/Transforms/Hexe/WorkloadExtractor.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/WorkloadExtractor.h @@ -0,0 +1,296 @@ +//===---------- Transforms/Hexe/WorkloadExtractor.h - Hexe -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// ===- High-Level Description of the Workload Extraction Utils and Pass - === +/// Key Goal: Design and implement utility classes and a compiler pass that +/// extract loops or functions for offloading. The Workload Analysis provides +/// Workload Information handlers which describe the eligible Loops and +/// Functions. +/// +/// Key Points: +/// a) Function/Loop Code Extraction. We need to extract the workload code +/// in a new Module that will be compiled for the target accelerators. +/// However we need to remain accelerator agnostic for portability reasons. +/// +/// b) Preserve the original version of the workload on the host module. An +/// accelerator may not be available for use during the application +/// execution for various reasons (occupied, disabled etc). We should be +/// able to fall back on CPU execution. +/// +/// c) Inject the neccessary runtime calls and control flow. A runtime +/// library controls the scheduling between the CPU and the accelerator, +/// coherency and dispatch on the accelerator. +/// +/// d) Replace the memory allocation method of the memory allocations that +/// are accessed by workloads that will be offloaded to the accelerator. +/// This is done by using memory management functions provided by the +/// runtime library. +/// We may have to replace: +/// 1) Heap Allocations (e.g. Malloc function calls), trivial +/// 2) Stack Allocations, (calloc), a bit tricky. We have to +/// replace a stack allocation with a library memory allocation, +/// meaning that we have to release that memory explicitly when +/// it is not required anymore. +/// 3) Global Variables, a bit tricky again. Global variables are +/// allocated in the data segment of the process address space. 
+/// In our scheme we need to replace their allocation method to +/// use memory provided by the Hexe runtime library calls. We do +/// that by emitting constructor and destructor functions that +/// use the library calls to allocate memory for the Global +/// Variables. These constructors and destructors are automatically +/// called at process initialization and exit. +/// +/// This file provides the declaration of three classes: +/// a) HexeWorkload encapsulates the new module we use to clone workloads +/// that are extracted for offloading. It provides operations for adding new +/// functions, inserting the required Hexe Metadata and enforcing module +/// validity. +/// +/// b) WorkloadExtractUtil is the class that takes care of the actual code +/// transformations. It exposes a simple interface. +/// +/// c) WorkloadExtractor is a Module pass that uses the WorkloadExtractUtil +/// functionality and transforms the code to support Workload Offloading. The +/// user can control its behavior via command line flags. It can be considered +/// as a basic concept pass that can be extended to serve a specific use case. +/// +/// Some Terminology: +/// Host: the main system platform, the processor that runs the Operating +/// System and the main application code. +/// +/// Accelerator: the co-processor where we offload workloads for +/// computation. +/// +/// Host Module: The original code module that gets transformed to support +/// workload offloading and gets compiled and run on the host platform. +/// +/// Hexe Module: The Module where we extract Workloads for which we enable +/// offloading. This module gets compiled to every accelerator target. It +/// is accelerator agnostic. Specializing for a specific accelerator target +/// is job of the Workload Transform Pass. +/// +/// FUTURE PLANS (TODO): +/// a) Make workload dispatch asynchronous. Design of the runtime library is +/// done. Work is required on the compiler analysis. +/// b) Speculation and Runtime Checks. Work is required on both compiler and +/// runtime library. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HEXE_WORKLOADEXTRACTOR_H +#define LLVM_TRANSFORMS_HEXE_WORKLOADEXTRACTOR_H + +#include +#include +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include + +namespace llvm { + +class WorkloadAnalysis; +class DataLayout; +class WorkloadInfo; +class DominatorTree; +class AllocaInst; +class TargetLibraryInfo; +class HexeWorkload; + +typedef std::tuple< Function *, unsigned, StructType *> HexeFunctionStructInfoT; +typedef std::map FunctionHexeFunctionMapT; + +/// HexeWorkload encapsulates the new module we use to save workloads +/// that are extracted for offloading. It provides operations for +/// adding new functions, inserting the required Hexe Metadata and enforcing +/// module validity. +class HexeWorkload { +public: + /// \brief The standard constructor + HexeWorkload(LLVMContext &Context, StringRef MName="Hexe_Workload"); + ~HexeWorkload() { delete M; delete DL; delete TargetTriple; } + + /// \brief sets the DataLayout and the Triple of the host platform. + void setDLandTriple(const DataLayout &DL, const Triple &TargetTriple); + + /// \brief inserts Hexe specific Metadata and writes the Module to a + /// file. 
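+  ///
+  /// A minimal sketch of the intended flow (illustrative, not the
+  /// authoritative pass sequence; HostM names the host Module, and
+  /// HexeWorkloadFName defaults to "hexe_workload.ll"):
+  /// \code
+  ///   HexeWorkload HW(HostM->getContext());
+  ///   HW.setDLandTriple(HostM->getDataLayout(),
+  ///                     Triple(HostM->getTargetTriple()));
+  ///   // ... extract workloads into HW via WorkloadExtractUtil ...
+  ///   HW.writeFile(HexeWorkloadFName);
+  /// \endcode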
+ void writeFile(StringRef Filename); + + /// \brief inserts Hexe specific Metadata to the Module + void writeModule(Module *M); + + /// \brief We support a limited number of function calls + void validateSupportedFunctionCalls(); + + /// \brief Adds a new Function to the Hexe Workload Module + /// + /// \returns a newly created Function that is part of the + /// Hexe Workload, a unique Hexe ID and a StructType generated + /// by the function interface. + HexeFunctionStructInfoT addFunctionPrototype(Function *F, FunctionType *FT); + + /// \brief It checks if there is Mapping for the Host Function F + /// + /// \returns the corresponding Function that is part of the + /// Hexe Workload, a unique Hexe ID and a StructType generated + /// by the function interface, if there is a mapping. + /// Otherwise nullptr. + HexeFunctionStructInfoT * getMapping(const Function *F) + { + auto I=Mapping.find( const_cast(F) ); + return ( I!=Mapping.end() ) ? &(I->second) : nullptr; + } + + /// \brief It checks if there is a mapping for the host + /// Function F in the Hexe Workload. + /// + /// \returns true if exists + bool hasMapping(const Function *F) { return getMapping(F); } + + /// \returns the LLVM Context + LLVMContext &getContext() { return C; } + + /// \returns the DataLayout + DataLayout *getDataLayout() { return DL; } + + /// \returns the Triple + Triple *getTriple() { return TargetTriple; } + +private: + LLVMContext &C; // LLVM Context + StringRef MSID; // Hexe Module String ID + DataLayout *DL; // DataLayout of the Host platform + Triple *TargetTriple; + Module *M; + unsigned nextFunctionID; + + // Host Function Mapping to the Hexe Workload Function, its Hexe ID + // and the StructType generated by the original function interface. + FunctionHexeFunctionMapT Mapping; +}; + + +/// This class performs all the necessary code transformations for +/// enabling the offloading of one or more workloads to an accelerator. +/// The Workload Analysis provides WorkloadInfo handlers that are used +/// by this class to perform the necessary transformations. +class WorkloadExtractUtil { +public: + /// \brief The Standard Constructor + /// \param WA: Workload Analysis + /// \param HM: Host Module, the original code module that will be + /// compiled for the host architecture. + /// \param Host Pass, the pass that utilizes this class object. + WorkloadExtractUtil(WorkloadAnalysis *WA, Module *HM, Pass *HP, + bool SupportGlobals=true) { + this->WA=WA; + this->HM=HM; + this->HP=HP; + setHexeRTFunctions(); + } + + /// \brief Provide the HexeWorkload object which will + /// store the extracted workload code. + void registerHexeWorkload(HexeWorkload *HW) { + this->HW=HW; + } + + /// \brief It performs the necessary code transformations + /// to enable the offloading of a Workload to the accelerator. + /// The WorkloadInfo handler should be given by the Workload + /// Analysis. A Workload can either be a Function Call or a + /// Loop. + bool extractWorkloadCode(WorkloadInfo *WI); + + /// It performs the necessary replacement of the Memory Allocations + /// that are used by Workloads that have been transformed for + /// offloading. Important: Call this only after having completed all + /// the required extractWorkloadCode calls. + bool replaceMemAllocations(); + +private: + // Documentation for the private member functions is available + // in the implementation file. 
+ + bool loopToFunction(WorkloadInfo *WI, WorkloadInfo *CW, DominatorTree *DT); + Function *cloneOrGetHexeFunction(WorkloadInfo *WI); + bool transformAndInjectGlueCode(WorkloadInfo *WI); + + Value *injectRuntimeSched(Instruction *II, + CallInst *CI, WorkloadInfo *WI); + Instruction *marshalAndOffload(Instruction *II, + CallInst *CI, WorkloadInfo *WI); + + void setHexeRTFunctions(); + void annotateMemAllocationsForReplacement(WorkloadInfo *WI); + + + WorkloadAnalysis *WA; //Hexe Workload Analysis + Module *HM; //Host Module + Pass *HP; //Host Pass + HexeWorkload *HW; //Hexe Workload Module + + // Hexe Runtime Function Declarations + Function *HexeDispatchCall; + Function *HexeCoherencyCall; + Function *HexeRuntimeSchedCall; + Function *HexeEventWaitCall; + Function *HexeMalloc; + Function *HexeFree; + + // Hexe Datatypes + StructType *HexeEventT; + StructType *MemoryAccessInfoT; + StructType *HexeKernelInfoT; + + // Memory Allocations used by workloads + // that need to be transformed to use + // Hexe memory allocation facilities. + SetVector MallocAllocations; + SetVector GlobalAllocations; + SetVector AllocaAllocations; +}; + +/// Available Access Modes for Memory Accesses +/// performed by the Workloads. +enum class MemAccessInfoAccessMode { + Read, + Write, + ReadWrite +}; + +/// WorkloadExtractor is a Module pass that uses the WorkloadExtractUtil +/// functionality and transforms the code to support Workload Offloading. The +/// user can control its behavior via command line flags. It can be considered +/// as a basic concept pass that can be extended to serve a specific use case. +class WorkloadExtractor : public ModulePass { +public: + static char ID; + WorkloadExtractor(); + ~WorkloadExtractor() {}; + + /// \brief It requests analyses etc + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// \brief It performs transformations that enable workload offloading. + /// The transformations can be controlled via the HexeFunctionCalls, + /// HexeLoops and HexePolicy flags. + bool runOnModule(Module &M) override; + + /// \brief It releases the memory used by the Extractor Pass. + void releaseMemory() override; + +private: + WorkloadAnalysis *WA; //Hexe Workload Analysis + HexeWorkload *HW; //Hexe Workload Module + WorkloadExtractUtil *WEU; //Workload Extraction Utility +}; + +} + +#endif Index: include/llvm/Transforms/Hexe/WorkloadTransform.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Hexe/WorkloadTransform.h @@ -0,0 +1,68 @@ +//===-------- Transforms/Hexe/WorkloadTransform.h - Hexe ----- -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// ===------- High-Level Description of the Workload Transform Pass -------=== +/// Key Goal: Design and implement a compiler pass that converts the Hexe +/// Workload we extract from host code modules to comply with the conventions +/// and limitations of particular accelerator environment. +/// +/// Depending the accelerator and its programming environment, we may have +/// to work on: +/// Function Interfaces +/// Enviroment Function Call Injection +/// ABI conversions +/// Big-Little Endianess +/// 32 vs 64 bit architectures. +/// +/// A modular design has been adopted. A virtual Adaptor class has been +/// designed. 
For every new convention that needs to be supported the +/// developer has to sub-class that interface, The interface is available +/// at: HeterogeneousAdaptors/Adaptors.h +/// +/// Future Plans (TODO): +/// a) Write an Adaptor for OpenCL/SPIR +/// b) Investigate more accelerator types and build a codebase of utility +/// classes that can be used across different acceleratos and conventions +/// This pass in early stage. +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HEXE_WORKLOADTRANSFORM_H +#define LLVM_TRANSFORMS_HEXE_WORKLOADTRANSFORM_H + +#include "llvm/Pass.h" +#include +#include + +namespace llvm { + +typedef std::tuple HexeFunctionInfoT; +typedef std::vector HexeFunctionInfoListT; + +//Pass Code +class WorkloadTransform : public ModulePass { +public: + WorkloadTransform(); + ~WorkloadTransform(){}; + static char ID; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// \brief Initialize the right Adaptor and performs the required + /// code transformations + bool runOnModule(Module &M) override; + + //\brief It release the memory used by the pass + void releaseMemory() override; +private: + HexeFunctionInfoListT FL; + unsigned functionNum; +}; + +} + +#endif //LLVM_TRANSFORMS_HEXE_WORKLOADTRANSFORM_H Index: lib/Transforms/CMakeLists.txt =================================================================== --- lib/Transforms/CMakeLists.txt +++ lib/Transforms/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(Vectorize) add_subdirectory(Hello) add_subdirectory(ObjCARC) +add_subdirectory(Hexe) Index: lib/Transforms/Hexe/CMakeLists.txt =================================================================== --- /dev/null +++ lib/Transforms/Hexe/CMakeLists.txt @@ -0,0 +1,10 @@ +add_llvm_library(LLVMHexe + Hexe.cpp + WorkloadAnalysis.cpp + WorkloadExtractor.cpp + WorkloadTransform.cpp + Utils.cpp + HeterogeneousAdaptors/Hexagon.cpp + ) + +add_dependencies(LLVMHexe LLVMIRReader intrinsics_gen) Index: lib/Transforms/Hexe/HeterogeneousAdaptors/Hexagon.cpp =================================================================== --- /dev/null +++ lib/Transforms/Hexe/HeterogeneousAdaptors/Hexagon.cpp @@ -0,0 +1,384 @@ +// === --- Hexagon.cpp - Heterogeneous Execution Engine ------*- C++ -*-=== // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// === -----------------------------------------------------------------=== // +/// \file +/// This file contains the definition of the Hexagon Adaptor of the +/// Heterogeneous Execution Engine. This Adaptor supports convention +/// transformations for the Qualcomm Hexagon DSP. +/// +/// It supports two modes: +/// a) Host Architecture: arm 32 and Accelerator Architecture: hexagon. This +/// cooperates with the hexagon plugin of the Hexe runtime library. +// +/// b) The same Host and Accelerator architecture. This cooperates with the +/// accelerator debug plugin of the Hexe runtime library and servers debugging +/// purposes. It can be used to evaluate and test Hexe compiler and runtime +/// functionality without the need of an accelerator at all. 
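/// For example (illustrative triples): mode (a) corresponds to a host such
/// as armv7-none-linux-gnueabi paired with a hexagon accelerator triple,
/// while mode (b) uses identical host and accelerator triples; see
/// isSupported() below for the exact checks.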
+// === -----------------------------------------------------------------=== // + + + +#include "llvm/Transforms/Hexe/HeterogeneousAdaptors/Adaptors.h" +#include "llvm/Transforms/Hexe/Utils.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +// Hexagon Adaptor Conventions. +// +// The host runtime/driver calls the following function: +// +// int __hexe_skel_invoke(uint32 handler, remote_buf* bufs); +// +// a) The first argument is an integer handler which describes +// which computation function needs to be called and how +// to parse the second argument. +// +// b)The second argument is an array of struct elements of the following +// type: +// +// struct remote_buf{ +// void *pointer; +// size_t size; +// }; +// +// This struct represents a memory segment that is shared between the +// host and the accelerator environment where the runtime and driver +// environment translates the pointer address to a valid one. +// +// We reserve the first element of the array for storing scalar +// arguments and the return value. The rest of the elements represent +// memory segments that are accessed by the computational function. +// +// According to our convention each computational function must have the +// following interface: +// +// int comp_function_interface(remote_buf* bufs); +// +// It takes a single argument, the array of shared segments. For each +// computational function we need to generate code that does the +// appropriate marshalling. +// +// The following macro is used to choose which computational function +// should be called from the __hexe_skel_invoke. +// +// #define CHOOSE_COMP_FUNCTION(hanlder) (((handler) >> 24) & 0x1f) +// +// Example: +// +// int hexe_skel_invoke(uint32 handler, remote_buf* bufs) +// { +// int functionID = CHOOSE_COMP_FUNCTION(handler); +// +// switch (functionID) { +// case 0: +// return compFunction0(bufs); +// case 1: +// return compFunction1(bufs); +// default: +// return 20; //error +// } +// } +// +// All the functions should return 0 on success and 20 in case of error. + + +/// Adaptor for the Hexagon convention +class HexagonWorkloadAdaptor : public HexeWorkloadAdaptor { + public: + HexagonWorkloadAdaptor() { } + ~HexagonWorkloadAdaptor() { } + + void transform(Module *M, const Triple &HTriple, const Triple &ATriple, + const HexeFunctionInfoListT &FL) override; + + bool isSupported(const Triple &HTriple, const Triple &ATriple) override; + + private: + /// \brief It transforms the computational functions of the Module to + /// have a compatible interface. It also generates the marshalling code + /// for reading the scalar and pointer arguments from the input + /// buffers. + void transformFunctions(Module *M, + const HexeFunctionInfoListT &IFL, + HexeFunctionInfoListT &OFL); + + /// \brief It transforms a computational function of the Module to + /// have a compatible interface. It also generates the marshalling code + /// for reading the scalar and pointer arguments from the input + /// buffers. + Function *transformFunction(Function *F); + + /// \brief It generates the skeleton function that the host runtime/ + /// driver calls. This function then calls the requrest computational + /// function. 
+ Function *generateSkelInvoke(Module *M, const HexeFunctionInfoListT &FL); + void DataLayoutTripleUpdate(Module *M, Triple HTriple, Triple ATriple); + Type *BufferST; + Type *BufferSTP; +}; + + + +void HexagonWorkloadAdaptor::transform(Module *M, const Triple &HTriple, + const Triple &ATriple, const HexeFunctionInfoListT &FL) +{ + LLVMContext &C = M->getContext(); + Type *VoidPT = Type::getInt8PtrTy(C); + Type *Int32 = Type::getInt32Ty(C); + + //Buffer Struct Type + Type *ET[] = { VoidPT, Int32 }; + BufferST = StructType::create( ArrayRef(ET, 2) ); + BufferSTP = PointerType::getUnqual(BufferST); + + //It transforms the functions + HexeFunctionInfoListT OFL; + transformFunctions(M, FL, OFL); + + //It generates the Skel function + generateSkelInvoke(M, OFL); + + //updates the DataLayout and The Triple of the module + DataLayoutTripleUpdate(M, HTriple, ATriple); +} + +void HexagonWorkloadAdaptor::transformFunctions(Module *M, + const HexeFunctionInfoListT &IFL, + HexeFunctionInfoListT &OFL) +{ + for(auto I = IFL.begin(), IE = IFL.end(); I!= IE; ++I) { + Function *F; + unsigned FID; + std::tie(F, FID) = *I; + OFL.push_back(std::make_tuple(transformFunction(F), FID)); + } +} + + +Function *HexagonWorkloadAdaptor::transformFunction(Function *F) +{ + Module *M = F->getParent(); + LLVMContext &C = M->getContext(); + Type *Int32 = Type::getInt32Ty(C); + ConstantInt *Zero = ConstantInt::get(Type::getInt32Ty(C), 0); + FunctionType *OFT = F->getFunctionType(); + Type *OFST = getCompactFunctionStruct(OFT, C); + Type *OFSTP = PointerType::getUnqual(OFST); + + //function_interface + FunctionType *UserFT = FunctionType::get(Int32, + ArrayRef(BufferSTP), false); + Function *NF = Function::Create(UserFT, GlobalValue::InternalLinkage, + F->getName()+"_compact", M); + Value *APointer = NF->arg_begin(); + + + //Create the marshalling basic block where the marshaslling + //operations take place + BasicBlock *BB = BasicBlock::Create(C, "marshallingblock", NF); + + //get the address of a particular buffer + auto getBufferAddr = [ &APointer, &C, &BB, &Zero ] + (unsigned Indx) { + Value *Indices[] = { + ConstantInt::get(Type::getInt32Ty(C), Indx), + Zero + }; + auto P = + GetElementPtrInst::CreateInBounds(APointer, + ArrayRef(Indices, 2), "", BB); + return new LoadInst(P, "", BB); + }; + + Instruction *LA = getBufferAddr(0); + Instruction *ArgStructP = CastInst::CreatePointerCast( + LA, OFSTP, "", BB); + + //get scalar Call Arg (The first buffer is reserved to contain the scalar + //arguments) + auto getCallArg = [ &ArgStructP, &C, &BB, &Zero ] + (unsigned Indx) { + Value *Indices[] = { Zero, + ConstantInt::get(Type::getInt32Ty(C), Indx) + }; + auto P = + GetElementPtrInst::CreateInBounds(ArgStructP, + ArrayRef(Indices, 2), "", BB); + return new LoadInst(P, "", BB); + }; + + //read scalar or pointer arguments + unsigned BIndex = 1; + SmallVector Args; + for (unsigned I = 0, IE = OFT->getNumParams(); IgetParamType(I); + if (PT->isPtrOrPtrVectorTy()) { + Instruction *BA = getBufferAddr(BIndex++); + Instruction *CBA = CastInst::CreatePointerCast(BA, PT, "", BB); + Args.push_back(CBA); + } else { + Args.push_back( getCallArg(I) ); + } + } + + //update return valuea lambda + auto updateReturnValue = [ &ArgStructP, &C, &BB, &Zero ] + ( unsigned Indx, Value *V) { + Value *Indices[] = { + Zero, + ConstantInt::get(Type::getInt32Ty(C), Indx) + }; + auto P = GetElementPtrInst::CreateInBounds(ArgStructP, + ArrayRef(Indices, 2), "", BB); + return new StoreInst(V, P, BB); + }; + + + //update the return value + CallInst *CI = 
CallInst::Create(F, Args, "", BB); + if (!OFT->getReturnType()->isVoidTy()) + updateReturnValue(OFT->getNumParams(), CI); + + ReturnInst::Create(C, Zero, BB); + + //inline the original function in the newly generated + //function + InlineFunctionInfo IFI; + bool rv = InlineFunction(CI, IFI); + assert(rv); + + F->eraseFromParent(); + return NF; +} + +/// \brief creates the instructions that compute the computational +/// function index. This index is used to call the requested function. +/// +/// #define CHOOSE_COMP_FUNCTION(hanlder) (((handler) >> 24) & 0x1f) +static Value *injectMethodIndexCompute(LLVMContext &C, Value *dwScalars, + BasicBlock *IB) +{ + ConstantInt *CShifts = ConstantInt::get(Type::getInt32Ty(C), 24); + ConstantInt *CMask = ConstantInt::get(Type::getInt32Ty(C), 0x1f); + + Value *V = + BinaryOperator::Create(Instruction::Shl, dwScalars, CShifts, "", IB); + return BinaryOperator::Create(Instruction::And, V, CMask, "", IB); +} + + +/// \brief it generates a switch case mapping a function index to a +/// function call. +static void injectSwitchCaseCode(LLVMContext &C, SwitchInst *SWI, + Value *BufferPArg, unsigned FIndex, + Function *CF, Function *HF) +{ + ConstantInt *FIndexVal = ConstantInt::get(Type::getInt32Ty(C), FIndex); + BasicBlock *BB = BasicBlock::Create(C, "", HF); + CallInst *CI = CallInst::Create(CF, BufferPArg, "", BB); + + ReturnInst::Create(C, CI, BB); + SWI->addCase(FIndexVal, BB); +} + +Function *HexagonWorkloadAdaptor::generateSkelInvoke(Module *M, + const HexeFunctionInfoListT &FL) +{ + LLVMContext &C = M->getContext(); + Type *Int32 = Type::getInt32Ty(C); + + //error value + Constant *CError = ConstantInt::get(Int32, 20); + + //hexe_skel_invoke_type + Type *IT[] = { Int32, BufferSTP }; + FunctionType *SkelInvokeFT = + FunctionType::get(Int32, ArrayRef(IT, 2), false); + + //create the SkelInvoke function and add a switch + Function *SkelF = + Function::Create(SkelInvokeFT, GlobalValue::ExternalLinkage, + "__hexe_skel_invoke", M); + + auto I = SkelF->arg_begin(); + Value *dwScalarsV = I; + ++I; + Value *BufferPV = I; + + BasicBlock *BB = BasicBlock::Create(C, "", SkelF); + Value *MIndexV = injectMethodIndexCompute(C, dwScalarsV, BB); + + BasicBlock *SwitchDefault = BasicBlock::Create(C, "", SkelF); + ReturnInst::Create(C, CError, SwitchDefault); + SwitchInst *SWI = SwitchInst::Create(MIndexV, SwitchDefault, FL.size(), BB); + + for (auto I = FL.begin(), IE = FL.end(); I!= IE; ++I) { + Function *F; + unsigned Index; + std::tie(F, Index) = *I; + injectSwitchCaseCode(C, SWI, BufferPV, Index, F, SkelF); + } + + + return SkelF; +} + +bool HexagonWorkloadAdaptor::isSupported(const Triple &HTriple, + const Triple &ATriple) +{ + //host and accelerator same triple (used for debug) + if ((HTriple.getArch() == ATriple.getArch()) && + (HTriple.getOS() == ATriple.getOS()) && + (HTriple.getEnvironment() == ATriple.getEnvironment())) + return true; + + //arm 32bits, linux standard gnu abi + if ((HTriple.getArch() == Triple::arm) && + (HTriple.getOS() == Triple::Linux) && + (HTriple.getEnvironment() == Triple::GNUEABI)) + return true; + + return false; +} + +void HexagonWorkloadAdaptor::DataLayoutTripleUpdate(Module *M, Triple HTriple, + Triple ATriple) +{ + if ((HTriple.getArch() == ATriple.getArch()) && + (HTriple.getOS() == ATriple.getOS()) && + (HTriple.getEnvironment() == ATriple.getEnvironment())) { + M->setTargetTriple(HTriple.str()); + return; + } + + if ((HTriple.getArch() == Triple::arm) && + (HTriple.getOS() == Triple::Linux) && + (HTriple.getEnvironment() == 
Triple::GNUEABI) && + (ATriple.getArch() == Triple::hexagon)) { + M->setTargetTriple(ATriple.str()); + DataLayout D("e-m:e-p:32:32-i64:64-a:0-v32:32-n16:32"); + M->setDataLayout(D.getStringRepresentation()); + } +} + +namespace llvm{ + +HexeWorkloadAdaptor *createHexagonWorkloadAdaptor() +{ + return new HexagonWorkloadAdaptor(); +} + +} Index: lib/Transforms/Hexe/Hexe.cpp =================================================================== --- /dev/null +++ lib/Transforms/Hexe/Hexe.cpp @@ -0,0 +1,57 @@ +//===-------- Hexe.cpp - Heterogeneous Execution Engine --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines the initialization function of the Heterogeneous +/// Execution Engine and the command line flags that control the engine +/// operations. +//===----------------------------------------------------------------------===// + + +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" + + +/// initializeHexe - Initialize all passes linked into Hexe library. +void llvm::initializeHexe(llvm::PassRegistry &Registry) +{ + llvm::initializeWorkloadAnalysisPass(Registry); + llvm::initializeWorkloadExtractorPass(Registry); + llvm::initializeWorkloadTransformPass(Registry); +} + +namespace llvm { +cl::opt HexeLoops( + "hexe-loops", + cl::desc("Activates all the eligible loops for offloading (Hexe)"), + cl::init(false)); + +cl::opt HexeFunctionCalls( + "hexe-functioncalls", + cl::desc("Activates all the eligible function calls for offloading" + " (Hexe)"), + cl::init(false)); + +cl::opt HexePolicy( + "hexe-policy", cl::desc("Defines Hexe Offloading Policy"), + cl::value_desc("hexe policy name"), cl::init("") ); + +cl::opt HexeWorkloadFName( + "hexe-workload-fname", cl::desc("Define Hexe Workload filename"), + cl::value_desc("hexe workload filename"), + cl::init("hexe_workload.ll") ); + +cl::opt HexeAdaptor( + "hexe-adaptor", cl::desc("Defines Hexe Accelerator Adaptor"), + cl::value_desc("hexe policy name"), cl::init("") ); + +cl::opt HexeAdaptorCheck( + "hexe-adaptor-check", + cl::desc("Checks the compatibility of the Adaptor (Hexe)"), + cl::init(true)); +} Index: lib/Transforms/Hexe/Utils.cpp =================================================================== --- /dev/null +++ lib/Transforms/Hexe/Utils.cpp @@ -0,0 +1,145 @@ +// ===----- Utils.cpp - Heterogeneous Execution Engine --------*- C++ -*-=== // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===-------------------------------------------------------------------=== // +/// \file +/// This file defines utility functions that: +/// a)create Struct Types based on function interfaces. +/// b)Hexe Metadata read and write functions. +/// c)erase Hexe Metadata from a Module +// ===-------------------------------------------------------------------=== // + + +#include "llvm/Transforms/Hexe/Utils.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" + + +//the operation of the functions is documented in the header file. 
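+
+//For reference, the Hexe named metadata produced by writeHexeMetadata has
+//roughly the following textual-IR shape (an illustrative sketch for two
+//functions with IDs 0 and 1; function names and triple are made up):
+//
+//  !hexe.info          = !{!0}    ; !0 = !{i32 2}  (number of functions)
+//  !hexe.host_triple   = !{!1}    ; !1 = !{!"armv7-none-linux-gnueabi"}
+//  !hexe.function_list = !{!2, !3}
+//  !2 = !{void (float*, i32)* @kernel0, i32 0}
+//  !3 = !{void (i32*)* @kernel1, i32 1}
+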
+ +namespace llvm{ + +StructType *getCompactFunctionStruct(FunctionType *FT, LLVMContext &C) +{ + SmallVector StructElemTypes(FT->params().begin(), + FT->params().end()); + + Type *RT = FT->getReturnType(); + if (!RT->isVoidTy()) + StructElemTypes.push_back(RT); + + return StructType::create(StructElemTypes); +} + +/// \brief writeFunctionInfoMD writes information about a specific function +/// manipulated by Hexe as metadata. +static void writeFunctionInfoMD(LLVMContext &C, + const HexeFunctionStructInfoT &FI, NamedMDNode *NN) +{ + Metadata *FM = ConstantAsMetadata::getConstant(std::get<0>(FI)); + Metadata *Findex = + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), + std::get<1>(FI))); + + Metadata *MDA[] = { FM, Findex }; + MDNode *N = MDNode::get(C, ArrayRef(MDA, 2) ); + NN->addOperand(N); +} + + +void readHexeMetadata(Module *M, unsigned &NumHexeFunctions, + HexeFunctionInfoListT &FL, Triple &HTriple) +{ + NamedMDNode *HI = M->getNamedMetadata("hexe.info"); + + //read Number of Hexe Functions + assert(HI); + ConstantAsMetadata *FN = + dyn_cast(HI->getOperand(0)->getOperand(0)); + assert(FN); + ConstantInt *CI = dyn_cast(FN->getValue()); + assert(CI); + NumHexeFunctions = CI->getZExtValue(); + + //read host platform triple + NamedMDNode *HHT = M->getNamedMetadata("hexe.host_triple"); + assert(HHT); + MDString *HTS = dyn_cast(HHT->getOperand(0)->getOperand(0)); + assert(HTS); + HTriple.setTriple(HTS->getString()); + + NamedMDNode *HFL = M->getNamedMetadata("hexe.function_list"); + assert(HFL); + assert(NumHexeFunctions == HFL->getNumOperands()); + + //read Function List Metadata + for (unsigned I = 0, IE = HFL->getNumOperands(); IgetOperand(I); + ConstantAsMetadata *FM = dyn_cast(MD->getOperand(0)); + Function *F = dyn_cast( FM->getValue() ); + + ConstantAsMetadata *FIDM = dyn_cast(MD->getOperand(1)); + ConstantInt *FID = dyn_cast(FIDM->getValue()); + FL.push_back(std::make_tuple(F, FID->getZExtValue())); + } + +} + +void writeHexeMetadata(Module *M, unsigned NumHexeFunctions, + const FunctionHexeFunctionMapT &Mapping, const Triple &HTriple) +{ + LLVMContext &C = M->getContext(); + + NamedMDNode *HI = M->getOrInsertNamedMetadata("hexe.info"); + assert(HI); + + //write Number of Hexe Functions + HI->dropAllReferences(); + Metadata *NumOfFunctions = + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(C), NumHexeFunctions)); + + HI->addOperand(MDNode::get(C, NumOfFunctions)); + + //write host platform triple + NamedMDNode *HHT = M->getOrInsertNamedMetadata("hexe.host_triple"); + assert(HHT); + + HHT->dropAllReferences(); + Metadata *HTripleS = MDString::get(C, HTriple.str()); + HHT->addOperand(MDNode::get(C, HTripleS)); + + //write function metadata + NamedMDNode *HFL = M->getOrInsertNamedMetadata("hexe.function_list"); + assert(HFL); + + std::vector< std::tuple< Function *, unsigned, StructType *> > FIV; + FIV.resize(Mapping.size()); + + for (auto I = Mapping.begin(), IE = Mapping.end(); I!= IE; ++I){ + size_t index = std::get<1>(I->second); + FIV[index] = I->second; + } + + for (auto I = FIV.begin(), IE = FIV.end(); I!= IE; ++I) + writeFunctionInfoMD(C, *I, HFL); +} + +void eraseHexeMetadata(Module *M) +{ + //gets and erase Metadata + auto removeMD = [ &M ]( StringRef MDN) { + M->eraseNamedMetadata(M->getNamedMetadata(MDN)); + }; + + removeMD("hexe.info"); + removeMD("hexe.host_triple"); + removeMD("hexe.function_list"); +} + +} Index: lib/Transforms/Hexe/WorkloadAnalysis.cpp =================================================================== --- /dev/null +++ 
lib/Transforms/Hexe/WorkloadAnalysis.cpp @@ -0,0 +1,968 @@ +// ===-- WorkloadAnalysis.cpp - Heterogeneous Execution Engine --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// === -------------------------------------------------------------------===// +/// \file +/// Implementation of the Workload Analysis Pass of the Heterogeneous +/// Executon Engine. Please read the header file documentation for high +/// level description. +// === -------------------------------------------------------------------=== // + + +#include "llvm/Transforms/Hexe/WorkloadAnalysis.h" +#include "llvm/Transforms/Hexe/InitializeHexePasses.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" +#include +#include +#include +#include + +#define DEBUG_TYPE "hwa" + +using namespace llvm; + +/// \brief isEligibleArray checks if we support a particular +/// Array Type. +/// +/// \returns true if the Array Type is supported. +static bool isEligibleArray(const Type *T); + +/// \brief isScalarType checks if T is a scalar type. +/// +/// \returns true on success. +static bool isScalarType(const Type *T) +{ + return T->isFloatTy() || T->isDoubleTy() || T->isIntegerTy(); +} + +/// \brief isEligibleStruct checks if we support a particular +/// Struct Type. +/// +/// \returns true if the Struct Type is supported. +static bool isEligibleStruct(const Type *T) +{ + if (!T->isStructTy()) + return false; + + const StructType *ST = dyn_cast(T); + for (auto I = ST->element_begin(), IE = ST->element_end(); I!= IE; ++I) + if ( !isScalarType(*I) && !isEligibleArray(*I) && !isEligibleStruct(*I) ) + return false; + + return true; +} + +/// \brief isEligibleArray checks if we support a particular +/// Array Type. +/// +/// \returns true if the Array Type is supported. +static bool isEligibleArray(const Type *T) +{ + if (!T->isArrayTy()) + return false; + const Type *AT = dyn_cast(T); + const Type *ET = AT->getArrayElementType(); + return isScalarType(ET) || isEligibleStruct(ET); +} + + +/// \brief inspectGlobalVariable checks if we support a particular +/// Global Variable Type. +/// +/// We support any type that is constructed from scalars, arrays +/// and structs. We do not support any pointer type. +/// +/// \returns true if the Global Variable Type is supported. +static bool inspectGlobalVariable(const Value *V) +{ + if (auto GV = dyn_cast(V)) { + Type *T = GV->getType()->getPointerElementType(); + + //We only support instruction, operator users + for (auto I = GV->user_begin(), IE = GV->user_end(); I!= IE; ++I) + { + if (!isa(*I) && !isa(*I)) + return false; + } + + return isScalarType(T) || isEligibleStruct(T) || isEligibleArray(T); + } + return false; +} + +/// \brief inspectInstruction checks if we support a particular instruction +/// for offloading. +/// +/// We do not support Atomic Operations, VAArg operations, function calls +/// and exceptions. However, we do support MemIntrinsics. +/// +/// \returns true if the Instruction is supported. 
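+///
+/// For example (illustrative): plain loads, stores and arithmetic are
+/// accepted, and so is a call to llvm.memcpy or llvm.memset, whereas
+/// atomics, va_arg, exception handling (invoke) and calls to arbitrary
+/// functions make the instruction unsupported.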
+static bool inspectInstruction(const Instruction &I) +{ + //We do support MemIntrinsic Calls, which they are generated + //either by the user or llvm passes. + if ( isa(I) ) + return true; + + if ( isa(I) || isa(I) || + isa(I) || isa(I) || + isa(I) || isa(I) ) + return false; + + //Otherwise the instruction is supported. + //alloca is supported. We can safely assume all the + //architectures we offload support stack allocations. + return true; +} + +/// \brief inspectBasicBlock checks if we support a particular Basic +/// Block for offloading. +/// +/// We do not support Basic Blocks that contain Atomic Operations, +/// VAArg operations, function calls and exceptions. However, we +/// do support MemIntrinsics. +/// +/// \returns true if the Basic Block is supported. +static bool inspectBasicBlock(const BasicBlock &BB) +{ + return std::all_of(BB.begin(), BB.end(), inspectInstruction); +} + +/// \brief analyzeMemRefValue analyzes a Memory reference value +/// that is retrieved by the Alias Analysis and strips pointer +/// casts, all zero geps and In Bounds Offsets. +/// +/// \returns the Value after stripping. +static Value *analyzeMemRefValue(Value *V) +{ + Value *OV = nullptr; + while (OV!= V) { + OV = V; + //if ( auto *BC = dyn_cast(P) ) + V = V->stripPointerCasts(); + V = V->stripInBoundsConstantOffsets(); + V = V->stripInBoundsOffsets(); + } + + return V; +} + +/// \brief resolveIfPHINode retrieves a Memory Reference +/// Value out of Loop constructs if the input param V +/// is a PHINode. +/// +/// \returns the Memory Reference Value if V is a PhiNode, +/// otherwise the original Value V. +static Value *resolveIfPHINode(Value *V) +{ + if (!isa(V)) + return V; + + PHINode *P = dyn_cast(V); + if (P->getNumIncomingValues()!= 2) + return P; + + Value *IV = analyzeMemRefValue(P->getIncomingValue(0)); + if (IV!= P) { + GetElementPtrInst *GEP = dyn_cast(IV); + if (!GEP) + return P; + if (GEP->getPointerOperand()!= P) + return P; + } + + return resolveIfPHINode( analyzeMemRefValue( P->getIncomingValue(1) ) ); +} + +/// \brief inspectMemAccess investigates if we support a particular memory +/// load or store instruction. +/// +/// If the memory operation is supported, we track the memory segment that +/// the instruction accesses. We track memory segments passed by the +/// function interface (InterfaceMemRefs) and global variables (GlobalMemRefs). +/// +/// \returns true if the memory access is supported. +static bool inspectMemAccess(const Instruction &I, AliasAnalysis *AA, + Function *F, WorkloadMemRefSet &InterfaceMemRefs, + WorkloadMemRefSet &GlobalMemRefs ) +{ + const LoadInst *LI = dyn_cast(&I); + const StoreInst *SI = dyn_cast(&I); + + //If the instruction is not a load or store we do not bother + //analyze further. We return true. + if ( !LI && !SI ) + return true; + + WorkloadMemRef Mref; + + //We use the Alias Analysis to reason about the memory segment. + if (LI) + Mref = MemoryLocation::get(LI); + else + Mref = MemoryLocation::get(SI); + + Value *P = const_cast(Mref.Ptr); + P = analyzeMemRefValue(P); + P = resolveIfPHINode(P); + Mref.Ptr = P; + + //Check if the memory segment is passed by the function interface. 
+ if (auto A = dyn_cast(Mref.Ptr)) { + if (A->getParent() == F){ + InterfaceMemRefs.insert(Mref); + return true; + } + } + + //We check if the memory segment is a global variable + if (inspectGlobalVariable(Mref.Ptr)) { + GlobalMemRefs.insert(Mref); + return true; + } + + //We check if the memory segment is a stack allocation + if (isa(Mref.Ptr)) + return true; + + //Unable to detect the origin of the memory segment + DEBUG(dbgs()<<"Analysis fails (Mem Reference): "<<*(Mref.Ptr)<<"\n" ); + return false; +} + +/// \brief inspectBasicBlockMemAccess investigates if we support +/// the memory accesses of a Basic Block. +/// +/// If the memory operation are supported, we track the memory +/// segments that the operations access. We track memory +/// segments passed by the function interface (InterfaceMemRefs) +/// and global variables (GlobalMemRefs). +/// +/// \returns true if the memory access is supported. +static bool inspectBasicBlockMemAccess(const BasicBlock &BB, + AliasAnalysis *AA, + Function *F, + WorkloadMemRefSet &InterfaceMemRefs, + WorkloadMemRefSet &GlobalMemRefs) +{ + return std::all_of(BB.begin(), BB.end(), + std::bind(inspectMemAccess, + std::placeholders::_1, AA, + F, std::ref(InterfaceMemRefs), + std::ref(GlobalMemRefs))); +} + +/// \brief It performs Code and Memory Reference Analysis +/// on Function F. If the Function is eligible, a Workload +/// Information (Function Type) entry is inserted in the +/// EligibleFunctionWorkloadMap. +/// +/// We analyze every instruction and memory access. If +/// the Function is eligible we keep track of it in +/// EligibleFunctionWorkloadMap. +/// +/// \returns true if the Function F is eligible. +bool WorkloadAnalysis::analyzeCodeMemRefEligibility(Function *F, + FunctionWorkloadT &EligibleFunctionWorkloadMap) +{ + WorkloadInfo WI; + if (!std::all_of(F->begin(), F->end(), inspectBasicBlock)) + return false; + + DEBUG(dbgs()<<"Code Analysis: Function: "<getName()<< + " completed successully\n"); + + if (!std::all_of(F->begin(), F->end(), + std::bind(inspectBasicBlockMemAccess, + std::placeholders::_1, AA, F, + std::ref(WI.InterfaceMemRefs), + std::ref(WI.GlobalMemRefs)))) + return false; + + DEBUG(dbgs()<<"Mem Ref Analysis: Function: "<getName()<< + " completed successully\n"); + + WI.F = F; + + if (WI.GlobalMemRefs.size()) + WI.type = WorkloadType::FunctionWithGlobals; + else + WI.type = WorkloadType::FunctionNoGlobals; + WorkloadInfo *DWI = new WorkloadInfo( std::move(WI) ); + EligibleFunctionWorkloadMap.insert( std::make_pair(F, DWI) ); + return true; +} + +/// \brief It performs Code and Memory Reference Analysis +/// on Loop L. If the Loop is eligible, a Workload +/// Information (Loop Type) entry is inserted in the +/// EligibleLoopWorkloadMap. +/// +/// We analyze every instruction and memory access. If +/// the Loop is eligible we keep track of it in +/// EligibleLoopWorkloadMap. +/// +/// \returns true if the Loop L is eligible. 
+bool WorkloadAnalysis::analyzeCodeMemRefEligibility(Loop *L, + LoopWorkloadMapT &EligibleLoopWorkloadMap) +{ + WorkloadInfo WI; + if (!std::all_of(L->block_begin(), L->block_end(), + [](const BasicBlock *BB) { return inspectBasicBlock(*BB); } )) + return false; + + Function *F = L->getHeader()->getParent(); + DEBUG(dbgs()<<"Code Analysis: Function "<getName()<<" Loop: "<< + L->getHeader()->getName()<<" completed successully\n"); + + auto inspect = [&](const BasicBlock *BB) { + return inspectBasicBlockMemAccess(*BB, AA, F, + std::ref(WI.InterfaceMemRefs), + std::ref(WI.GlobalMemRefs)); + }; + + if (!std::all_of(L->block_begin(), L->block_end(), inspect )) + return false; + + DEBUG(dbgs()<<"Mem Ref Analysis: Function: "<getName()<<" Loop: " + <getHeader()->getName()<<" completed successully\n"); + + WI.L = L; + WI.F = F; + + if (WI.GlobalMemRefs.size()) + WI.type = WorkloadType::LoopWithGlobals; + else + WI.type = WorkloadType::LoopNoGlobals; + + //We use CodeExtractor to check if we can extract + //a Loop, we do not perform any code modification + //this is an analysis pass! + CodeExtractor CE(*DT, *L); + if (!CE.isEligible()) + return false; + + WorkloadInfo *DWI = new WorkloadInfo( std::move(WI) ); + EligibleLoopWorkloadMap.insert( std::make_pair(L, DWI) ); + + return true; +} + + +/// \brief It performs Code And Memory Reference Analysis on a +/// Function and its nested Loops. +/// +/// We keep track of Eligible Functions and Loops in +/// EligibleFunctionWorkloadMap and EligibleLoopWorkloadMap. +void WorkloadAnalysis::analyzeCodeMemRef(Function &F, + FunctionWorkloadT + &EligibleFunctionWorkloadMap, + LoopWorkloadMapT &EligibleLoopWorkloadMap) +{ + //If it is a function declaration we skip + if (F.isDeclaration()) + return; + + //if the function returns a non scalar type, it is not + //supported + if (isa(F.getReturnType())) + return; + + AA = &getAnalysis(); + LoopInfoCache.push_back( + std::move( getAnalysis(F).getLoopInfo() ) ); + LI = &LoopInfoCache.back(); + DT = &getAnalysis(F).getDomTree(); + + + //We analyze the nested Loops of the function in a buttom up + //manner. If a loop is not eligible, we already know that + //its parent loops and the function are not eligible either. + //In this case we stop analyzing. + std::stack Loops; + std::set Skip; + std::set Visited; + bool skipFunction = false; + + for (auto I = LI->begin(), IE = LI->end(); I!= IE; ++I) + Loops.push(*I); + + while (Loops.size()){ + Loop *L = Loops.top(); + if (!Visited.count(L)){ + for (auto I = L->begin(), IE = L->end(); I!= IE; ++I) + Loops.push(*I); + Visited.insert(L); + } else { + Loops.pop(); + + if (!Skip.count(L) && + !analyzeCodeMemRefEligibility(L, EligibleLoopWorkloadMap) ) { + Loop *LP = L->getParentLoop(); + while (LP){ + Skip.insert(LP); + LP = LP->getParentLoop(); + } + skipFunction = true; + } + } + } + + if (!skipFunction) + analyzeCodeMemRefEligibility(&F, EligibleFunctionWorkloadMap); +} + +/// \brief It maps the memory references of a function to +/// the Memory Allocations used in a particular function call +/// context. +/// +/// A memory allocation can be a Global Variable, a +/// dynamic allocation (malloc) or a stack allocation (alloca). 
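+// A source-level sketch of what this classification means (hypothetical
+// caller code, not part of this patch) for a callee
+// `void kernel(float *buf, int n)` whose only memory reference is `buf`:
+//
+//   float Global[256];
+//   void caller(void) {
+//     float Stack[256];
+//     float *Heap = (float *)malloc(256 * sizeof(float));
+//     kernel(Global, 256);   // buf -> GlobalAllocations
+//     kernel(Stack, 256);    // buf -> AllocaAllocations
+//     kernel(Heap, 256);     // buf -> MallocAllocations
+//   }
+//   void wrapper(float *p) {
+//     kernel(p, 256);        // buf originates from the caller's own
+//   }                        // interface: not supported yet, call rejected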
+static bool +inspectFunctionCallMemAllocUse(const CallInst *CI, + AliasAnalysis *AA, + Function *Caller, + WorkloadMemRefSet &FInterfaceMemRefs, + SetVector &GlobalAllocations, + SetVector &MallocAllocations, + SetVector &AllocaAllocations, + TargetLibraryInfo *TLI) +{ + for (auto I = FInterfaceMemRefs.begin(), + IE = FInterfaceMemRefs.end(); I!= IE; ++I) { + + WorkloadMemRef FMref, Mref; + FMref = *I; + assert(isa(FMref.Ptr)); + const Argument *Arg = dyn_cast(FMref.Ptr); + + AliasAnalysis::ModRefResult Mask = AliasAnalysis::ModRef; + Mref = AA->getArgLocation(CI, Arg->getArgNo(), Mask); + + Value *P = const_cast(Mref.Ptr); + P = analyzeMemRefValue(P); + P = resolveIfPHINode(P); + + //We check if we refer to a global variable + if (inspectGlobalVariable(P)){ + GlobalVariable *GV = dyn_cast(P); + GlobalAllocations.insert(GV); + continue; + } + + //malloc allocations + if (CallInst *MC = extractMallocCall(P, TLI)) { + MallocAllocations.insert(MC); + continue; + } + + //alloca instruction + if (AllocaInst *AI = dyn_cast(P)) { + AllocaAllocations.insert(AI); + continue; + } + + //We check if the memory reference is passed from + //the function interface. we don't support it yet + if (auto A = dyn_cast(P)){ + if (A->getParent() == Caller){ + return false; + //continue; + } + } + DEBUG(dbgs()<<"Analysis fails (Mem Allocation): "<<*P<<"\n"); + //Unable to detect the origin of the memory + return false; + } + + return true; +} + +/// \brief It performs Memory Allocation Use Analysis on +/// Function F. It analyzes all the call sites of Function F. +/// For each call site we map the memory references of the +/// function with the Memory Allocations used in the particular +/// function call context. +/// +/// A memory allocation can be a Global Variable, a dynamic +/// allocation (malloc) or a stack allocation (alloca). +void +WorkloadAnalysis::analyzeMemAllocUseEligibility(Function *F, + WorkloadInfo *WI, + FunctionCallersSet &Callers) +{ + assert( WI->isFunction() ); + + WorkloadInfo CWI = *WI; + + for (auto I = Callers.begin(), IE = Callers.end(); I!= IE; ++I){ + //AliasAnalysis is a Group Pass interface, an AA implementation may be + //a function or module pass, we get the analysis explicitly for the + //function to guarantee correct data. + Value *V; + Function *CallerF; + std::tie(V, CallerF) = *I; + + if (!V) + continue; + assert( isa(V) || isa(V) ); + + //we do not support code with exceptions for offloading + if (isa(V)) + continue; + + //We know it is a call instruction + CallInst *CI = dyn_cast(V); + + WorkloadInfo CWI = *WI; + CWI.CI = CI; + + auto insertGlobal = [&](const WorkloadMemRef &I) { + assert( inspectGlobalVariable(I.Ptr) ); + CWI.GlobalAllocations.insert( + dyn_cast( const_cast(I.Ptr) ) ); + }; + + //global variable accesses + std::for_each(WI->GlobalMemRefs.begin(), WI->GlobalMemRefs.end(), + insertGlobal); + + if (inspectFunctionCallMemAllocUse(CI, AA, CallerF, WI->InterfaceMemRefs, + CWI.GlobalAllocations, + CWI.MallocAllocations, + CWI.AllocaAllocations, TLI)) { + + CallWorkloadMap.insert(std::make_pair(CI, + new WorkloadInfo(std::move(CWI)))); + ECalls[F].insert(CI); + + DEBUG(dbgs()<<"Mem Allocation Analysis: Callee Function: "<getName() + <<" Caller Function: "<getName()<<" completed successfully\n"); + } + //else not eligible we skip it + } +} + +/// \brief It performs Memory Allocation Use Analysis on Loop +/// L. It analyzes all the call sites of its parent Function. 
+/// For each call site we map the memory references of the +/// Loop with the Memory Allocations used in the particular +/// function call context. If we can successfully analyze +/// all the call sites, this Loop is considered eligible. +/// +/// A memory allocation can be a Global Variable, a dynamic +/// allocation (malloc) or a stack allocation (alloca). +void +WorkloadAnalysis::analyzeMemAllocUseEligibility(Loop *L, + WorkloadInfo *WI, + FunctionCallersSet &Callers) +{ + assert( WI->isLoop() ); + + WorkloadInfo CWI = *WI; + + if (!Callers.size()) + return; + + for (auto I = Callers.begin(), IE = Callers.end(); I!= IE; ++I) { + //AliasAnalysis is a Group Pass interface, an AA implementation may be + //a function or module pass, we get the analysis explicitly for the + //function to guarantee correct data. + Value *V; + Function *CallerF; + std::tie(V, CallerF) = *I; + + if (!V) + continue; + + assert( isa(V) || isa(V) ); + + //we do not support code with exceptions for offloading + if (isa(V)) + continue; + + //We know it is a call instruction + CallInst *CI = dyn_cast(V); + + auto insertGlobal = [&](const WorkloadMemRef &I) { + assert( inspectGlobalVariable(I.Ptr) ); + CWI.GlobalAllocations.insert( + dyn_cast( const_cast(I.Ptr) ) ); + }; + + //global variable accesses + std::for_each(WI->GlobalMemRefs.begin(), WI->GlobalMemRefs.end(), + insertGlobal); + + // if we cannot reason for a partical call of the function we abort, + //the loop is not eligible + if (!inspectFunctionCallMemAllocUse(CI, AA, CallerF, + WI->InterfaceMemRefs, + CWI.GlobalAllocations, + CWI.MallocAllocations, + CWI.AllocaAllocations, TLI)) + return; + } + + LoopWorkloadMap.insert( + std::make_pair(WI->getLoop(), new WorkloadInfo(std::move(CWI)) ) ); + ELoops[WI->getFunction()].insert(L); + + DEBUG(dbgs()<<"Mem Allocation Analysis: Callee Function: "<< + WI->getFunction()->getName()<<" Loop: "<< + L->getHeader()->getName()<<" completed successfully\n"); +} + + +/// \brief It reverses the call graph information for the functions +/// we want to analyze their call sites. +static void +reverseCallGraphInfo(std::map &CallInfo, + const FunctionWorkloadT &EligibleFunctionWorkloadMap, + const LoopWorkloadMapT &EligibleLoopWorkloadMap, + CallGraph *CG) +{ + for (auto I = EligibleFunctionWorkloadMap.begin(), + IE = EligibleFunctionWorkloadMap.end(); I!= IE; ++I) + CallInfo[I->first]; + + for (auto I = EligibleLoopWorkloadMap.begin(), + IE = EligibleLoopWorkloadMap.end(); I!= IE; ++I) + CallInfo[I->first->getHeader()->getParent()]; + + WeakVH WV; + CallGraphNode *Callee; + CallGraphNode *ECN, *CEN; + + ECN = CG->getExternalCallingNode(); + CEN = CG->getCallsExternalNode(); + + for (auto I = CG->begin(), IE = CG->end(); I!= IE; ++I) { + if (I->second == ECN || I->second == CEN) + continue; + Function *CallerF = I->second->getFunction(); + for (auto J = I->second->begin(), JE = I->second->end(); J!= JE; ++J) { + std::tie(WV, Callee) = *J; + Function *CalleeF = Callee->getFunction(); + auto CI = CallInfo.find(CalleeF); + if (CI!= CallInfo.end()) + CI->second.insert(std::make_tuple(WV, CallerF)); + } + } +} + +bool WorkloadAnalysis::runOnModule(Module &M) +{ + CG = &getAnalysis().getCallGraph(); + TLI = &getAnalysis().getTLI(); + + // 1) We perform Code and Memory Reference Analysis on + // every Function definition and its nested Loops. 
+ for (auto I = M.begin(), IE = M.end(); I!= IE; ++I) + analyzeCodeMemRef(*I, EligibleFunctionWorkloadMap, + EligibleLoopWorkloadMap); + + // We get function call sites in a convenient representation + std::map CallInfo; + reverseCallGraphInfo(CallInfo, EligibleFunctionWorkloadMap, + EligibleLoopWorkloadMap, CG); + + // 2) We perform Memory Allocation Use Analysis on Functions that passed + // the first step, the Code and Memory Reference Analysis. + for (auto I = EligibleFunctionWorkloadMap.begin(), + IE = EligibleFunctionWorkloadMap.end(); I!= IE; ++I) + analyzeMemAllocUseEligibility(I->first, I->second, CallInfo[I->first]); + + // 2) We perform Memory Allocation Use Analysis on Loops that passed + // the first step, the Code and Memory Reference Analysis. + for (auto I = EligibleLoopWorkloadMap.begin(), + IE = EligibleLoopWorkloadMap.end(); I!= IE; ++I) + analyzeMemAllocUseEligibility(I->first, I->second, + CallInfo[I->first->getHeader()->getParent()]); + + return false; +} + +template < class T> +void deleteObjects(T &Workload) +{ + for (auto I = Workload.begin(), IE = Workload.end(); I!= IE; ++I) + delete I->second; + Workload.clear(); +} + +void WorkloadAnalysis::releaseMemory() +{ + deleteObjects(EligibleFunctionWorkloadMap); + deleteObjects(EligibleLoopWorkloadMap); + + deleteObjects(CallWorkloadMap); + deleteObjects(LoopWorkloadMap); + + ECalls.clear(); + ELoops.clear(); + + std::for_each(GarbageCollector.begin(), GarbageCollector.end(), + [](WorkloadInfo *I) { delete I; } ); + + deleteObjects(CallWorkloadMap); + GarbageCollector.clear(); +} + +template < class T, class U > +static void eraseWorkload(T &Workloads, const U &Key) +{ + auto I = Workloads.find(Key); + if (I == Workloads.end()) + return; + delete I->second; + Workloads.erase(I); +} + + +bool WorkloadAnalysis::setForOffloading(WorkloadInfo *WI) +{ + if ( WI->isCall() && CallWorkloadMap.count(WI->getCall()) ) { + Function *F = WI->getFunction(); + GarbageCollector.push_back(WI); + CallWorkloadMap.erase(WI->getCall()); + //find and remove the loops of the function that are eligible + //for offloading. 
+ std::set &Loops = ELoops[F]; + for (auto I = Loops.begin(), IE = Loops.end(); I!= IE; ++I) + eraseWorkload(LoopWorkloadMap, *I); + ELoops.erase(F); + + return true; + } else if ( WI->isLoop() && LoopWorkloadMap.count(WI->getLoop()) ) { + GarbageCollector.push_back(WI); + LoopWorkloadMap.erase(WI->getLoop()); + Loop *L = WI->getLoop(); + + //Remove parent function calls from the eligible workloads + Function *PF = L->getHeader()->getParent(); + std::set &Calls = ECalls[PF]; + for (auto I = Calls.begin(), IE = Calls.end(); I!= IE; ++I) + eraseWorkload(CallWorkloadMap, *I); + ECalls.erase(PF); + + //Remove eligibles loops for the same function + std::set &Loops = ELoops[PF]; + for (auto I = Loops.begin(), IE = Loops.end(); I!= IE; ++I) + eraseWorkload(LoopWorkloadMap, *I); + ELoops.erase(PF); + + return true; + } + return false; +} + +bool WorkloadAnalysis::MutateLoopToCallWorkload(WorkloadInfo *WI, + Function *F, CallInst *CI) +{ + bool RV; + WI->F = F; + WI->CI = CI; + if (WI->type == WorkloadType::LoopNoGlobals) + WI->type = WorkloadType::CallNoGlobals; + else + WI->type = WorkloadType::CallWithGlobals; + + WI->InterfaceMemRefs.clear(); + WI->GlobalMemRefs.clear(); + + RV = std::all_of(F->begin(), F->end(), + std::bind(inspectBasicBlockMemAccess, + std::placeholders::_1, + AA, F, std::ref(WI->InterfaceMemRefs), + std::ref(WI->GlobalMemRefs))); + + assert(RV); + return RV; +} + +/// \brief It orders the values alphabetically for printing. +static void printOrdered(llvm::raw_ostream &O, + std::vector &Names) +{ + std::sort(Names.begin(), Names.end()); + for (auto I = Names.begin(), IE=Names.end(); I != IE; ++I) + O<<*I<<" "; +} + +//reportMemRef reports the Memory References of a Workload +static void reportMemRef(llvm::raw_ostream &O, WorkloadInfo *WI) +{ + std::vector Names; + + O<<"\t\tInterfaceMemRefs: "; + for (auto J = WI->getInterfaceMemRefs().begin(), + JE = WI->getInterfaceMemRefs().end(); J!= JE; ++J) + Names.push_back(J->Ptr->getName()); + + printOrdered(O,Names); + Names.clear(); + + O<<"\n\t\tGlobalMemRefs: "; + for (auto J = WI->getGlobalMemRefs().begin(), + JE = WI->getGlobalMemRefs().end(); J!= JE; ++J) + Names.push_back(J->Ptr->getName()); + + printOrdered(O,Names); + O<<"\n"; +} + +//reportAllocaRef reports the Memory Allocations accessed by a Workload +static void reportAllocaRef(llvm::raw_ostream &O, WorkloadInfo *WI) +{ + std::vector Names; + + O<<"\t\tGlobalAllocations: "; + for (auto J = WI->getGlobalAllocations().begin(), + JE = WI->getGlobalAllocations().end(); J!= JE; ++J) + Names.push_back((*J)->getName()); + + printOrdered(O,Names); + Names.clear(); + + O<<"\n\t\tMallocAllocations: "; + for (auto J = WI->getMallocAllocations().begin(), + JE = WI->getMallocAllocations().end(); J!= JE; ++J) + Names.push_back((*J)->getName()); + + printOrdered(O,Names); + Names.clear(); + + O<<"\n\t\tAllocaAllocations: "; + for (auto J = WI->getAllocaAllocations().begin(), + JE = WI->getAllocaAllocations().end(); J!= JE; ++J) + Names.push_back((*J)->getName()); + + printOrdered(O,Names); + O<<"\n"; +} + + +void WorkloadAnalysis::print(llvm::raw_ostream &O, const Module *M) const +{ + typedef std::tuple Info; + std::vector PrintInfo; + auto InfoCmp = [](const Info &A, const Info &B) { + return std::get<0>(A).compare(std::get<0>(B)) < 0; + }; + + O<<"Eligible Functions (Code and Memory Reference Analysis):\n"; + for (auto I = EligibleFunctionWorkloadMap.begin(), + IE = EligibleFunctionWorkloadMap.end(); I!= IE; ++I) { + PrintInfo.push_back(std::make_tuple(I->first->getName(), 
I->second)); + } + + std::sort(PrintInfo.begin(), PrintInfo.end(), InfoCmp); + for (auto I = PrintInfo.begin(), IE = PrintInfo.end(); I != IE; ++I) { + O<<"\t"<(*I)<<":\n"; + reportMemRef(O, std::get<1>(*I)); + } + O<<"-----------------------------------\n"; + + O<<"Eligible Loops (Code and Memory Reference Analysis):\n"; + PrintInfo.clear(); + + for (auto I = EligibleLoopWorkloadMap.begin(), + IE = EligibleLoopWorkloadMap.end(); I!= IE; ++I) { + std::string Key = I->first->getHeader()->getName(); + Key += ", Parent Function: "; + Key += I->first->getHeader()->getParent()->getName(); + PrintInfo.push_back( std::make_pair(Key, I->second) ); + } + + std::sort(PrintInfo.begin(), PrintInfo.end(), InfoCmp); + for (auto I = PrintInfo.begin(), IE = PrintInfo.end(); I != IE; ++I) { + O<<"\t"<(*I)<<":\n"; + reportMemRef(O, std::get<1>(*I)); + } + O<<"-----------------------------------\n"; + + O<<"Eligible Functions (Memory Allocation Use Analysis):\n"; + PrintInfo.clear(); + + for (auto I = CallWorkloadMap.begin(), + IE = CallWorkloadMap.end(); I!= IE; ++I) { + std::string Key = I->first->getName(); + Key += ", Callee: "; + Key += I->second->getFunction()->getName(); + Key += " Caller: "; + Key += I->first->getParent()->getParent()->getName(); + PrintInfo.push_back( std::make_pair(Key, I->second) ); + } + + std::sort(PrintInfo.begin(), PrintInfo.end(), InfoCmp); + for (auto I = PrintInfo.begin(), IE = PrintInfo.end(); I != IE; ++I) { + O<<"\t"<(*I)<<":\n"; + reportAllocaRef(O, std::get<1>(*I)); + } + O<<"-----------------------------------\n"; + + O<<"Eligible Loops (Memory Allocation Use Analysis):\n"; + PrintInfo.clear(); + + for (auto I = LoopWorkloadMap.begin(), + IE = LoopWorkloadMap.end(); I!= IE; ++I) { + std::string Key = I->first->getHeader()->getName(); + Key += ", Parent Function: "; + Key += I->first->getHeader()->getParent()->getName(); + PrintInfo.push_back( std::make_pair(Key, I->second) ); + } + + std::sort(PrintInfo.begin(), PrintInfo.end(), InfoCmp); + for (auto I = PrintInfo.begin(), IE = PrintInfo.end(); I != IE; ++I) { + O<<"\t"<(*I)<<":\n"; + reportAllocaRef(O, std::get<1>(*I)); + } + O<<"-----------------------------------\n"; +} + +char WorkloadAnalysis::ID = 1; //just because everyone sets it to 0 +INITIALIZE_PASS_BEGIN(WorkloadAnalysis, + "hexe-analysis", "Hexe Workload Analysis", false, true) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(WorkloadAnalysis, "hexe-analysis", + "Hexe Workload Analysis", false, true) + +ModulePass *createWorkloadAnalysisPass() +{ + return new WorkloadAnalysis(); +} + +WorkloadAnalysis::WorkloadAnalysis():ModulePass(ID), AA(nullptr), + LI(nullptr), DT(nullptr), CG(nullptr) +{ + initializeWorkloadAnalysisPass(*PassRegistry::getPassRegistry()); +} + +void WorkloadAnalysis::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + Index: lib/Transforms/Hexe/WorkloadExtractor.cpp =================================================================== --- /dev/null +++ lib/Transforms/Hexe/WorkloadExtractor.cpp @@ -0,0 +1,1222 @@ +// === WorkloadExtractor.cpp - Heterogeneous Execution Engine -*- C++ -*-=== // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois 
Open Source +// License. See LICENSE.TXT for details. +// +// === ------------------------------------------------------------------=== // +/// \file +/// Implementation of the Workload Extraction Utilities and Pass of the +/// Heterogeneous Execution Engine. Please read the header file documentation +/// for high level description. +// === ------------------------------------------------------------------=== // + + +#include "llvm/Transforms/Hexe/WorkloadAnalysis.h" +#include "llvm/Transforms/Hexe/WorkloadExtractor.h" +#include "llvm/Transforms/Hexe/InitializeHexePasses.h" +#include "llvm/Transforms/Hexe/Hexe.h" +#include "llvm/Transforms/Hexe/Utils.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/DominanceFrontier.h" +#include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include + + +#define DEBUG_TYPE "hwe" + +using namespace llvm; +HexeWorkload::HexeWorkload(LLVMContext &C, StringRef MSID) + : C(C), MSID(MSID), DL(nullptr), TargetTriple(nullptr), M(nullptr) +{ + nextFunctionID = 0; + M = new Module(MSID, C); +} + +void HexeWorkload::setDLandTriple(const DataLayout &DL, + const Triple &TargetTriple) +{ + delete this->DL; + this->DL = new DataLayout(DL); + delete this->TargetTriple; + this->TargetTriple = new Triple(TargetTriple); +} + +void HexeWorkload::writeFile(StringRef Filename) +{ + assert( DL && TargetTriple ); + //writes Hexe Metadata on the module + writeHexeMetadata(M, nextFunctionID, Mapping, *TargetTriple); + + //keeps the original host DataLayout + //sets special Hexe Target Triple + M->setDataLayout(DL->getStringRepresentation()); + M->setTargetTriple("hexe-unknown--unknown"); + + //writes the module to File + std::error_code EC; + raw_fd_ostream fs(Filename, EC, sys::fs::F_None); + WriteBitcodeToFile(M, fs); +} + +void HexeWorkload::writeModule(Module *M) +{ + assert( DL && TargetTriple ); + + //writes Hexe Metadata on the module + writeHexeMetadata(M, nextFunctionID, Mapping, *TargetTriple); + + //keeps the original host DataLayout + //sets special Hexe Target Triple + M->setDataLayout(DL->getStringRepresentation()); + M->setTargetTriple("hexe-unknown--unknown"); +} + + +HexeFunctionStructInfoT +HexeWorkload::addFunctionPrototype(Function *F, FunctionType *FT) +{ + assert( Mapping.find(F) == Mapping.end() ); + + Function *EF = + Function::Create(FT, GlobalValue::AvailableExternallyLinkage, + F->getName(), M); + StructType *ST = getCompactFunctionStruct(FT, C); + auto T = std::make_tuple(EF, nextFunctionID++, ST); + Mapping.insert(std::make_pair(F, T)); + + return T; +} + + +/// In principal, we do not support offloading of loops or +/// functions that contain function calls. However, we +/// do an exception for Memory Intrinsics. This group +/// of intrinsics performs standard memory operations +/// which are supported across the various LLVM +/// Targets. This function injects valid Function +/// Declarations for the Memory intrinsics that are +// used in the Hexe Module. 
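+// For example, a cloned workload containing
+//
+//   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 256, i32 4, i1 false)
+//
+// keeps that call: the loop below re-creates a matching declaration for the
+// intrinsic inside the standalone workload module and repoints the call site
+// at it, since the cloned body would otherwise still reference the host
+// module's declaration.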
+void HexeWorkload::validateSupportedFunctionCalls() +{ + std::map > Registry; + + //function iterator + for (auto I = M->begin(), IE = M->end(); I!= IE; ++I) + //BasicBlock iterator + for (auto J = I->begin(), JE = I->end(); J!= JE; ++J) + // Instruction Iterator + for (auto K = J->begin(), KE = J->end(); K!= KE; ++K) + if ( auto CI = dyn_cast(K) ) + Registry[CI->getCalledFunction()].push_back(CI); + + + for (auto I = Registry.begin(), IE = Registry.end(); I!= IE; ++I) { + Function *LF = Function::Create(I->first->getFunctionType(), + I->first->getLinkage(), + I->first->getName(), M); + LF->setCallingConv(I->first->getCallingConv()); + LF->setAttributes(I->first->getAttributes()); + + for (auto J = I->second.begin(), JE = I->second.end(); J!= JE; ++J) + (*J)->setCalledFunction(LF); + } +} + + +/// \brief It detects the Call Instruction in Function \param +/// Caller that calls the Function \param Callee. +/// +/// When we extract a Loop it gets converted into a function. We +/// use findCallInst to detect the call instruction in the original +/// Function that calls the extracted Loop function. +static CallInst *findCallInst(Function *Caller, Function *Callee) +{ + for (auto I = Caller->begin(), IE = Caller->end(); I!= IE; ++I) { + for (auto J = I->begin(), JE = I->end(); J!= JE; ++J) { + if ( CallInst *CI = dyn_cast(J) ) { + if (CI->getCalledFunction() == Callee) + return CI; + } + } + } + + assert( 0 && "We shouldn't reach here" ); + return nullptr; +} + +/// \brief It extracts a Loop and creates a new function that +/// solely contains the Loop. It then contacts the Workload +/// Analysis to update the Workload Info for this Workload. +/// +/// Use case: If we decide to offload a Loop workload, we +/// convert it to a Function Call Workload and we then +/// treat it as a Function Call Offloading. +bool WorkloadExtractUtil::loopToFunction(WorkloadInfo *WI, + WorkloadInfo *CW, + DominatorTree *DT) +{ + //we extract the loop as a function + Loop *L = WI->getLoop(); + Function *PF = L->getHeader()->getParent(); + CodeExtractor CE(*DT, *L, false); + assert( CE.isEligible() ); + Function *LF = CE.extractCodeRegion(); + DT->verifyDomTree(); + + CallInst *CI = findCallInst(PF, LF); + *CW = *WI; + + // We update the Workload Info contents + WA->MutateLoopToCallWorkload(CW, LF, CI); + return true; +} + + +/// \brief This function clones a function to the Hexe Workload Module. +//// +/// We perform the following on the cloned Function: +/// If the function accesses Global Variables we need to adjust the +/// function interface. We append an argument for every Global Variable +/// the function accesses. The argument has the same type as the Global +/// Variable. We update all the Global Variable references to point +/// to the newly added arguments. +/// The original function remains intact. +Function *WorkloadExtractUtil::cloneOrGetHexeFunction(WorkloadInfo *WI) +{ + // If a Hexe function has already been cloned we return the Hexe + // function. + if (auto I = HW->getMapping(WI->getFunction())) + return std::get<0>( *I ); + + Function *OF = WI->getFunction(); + FunctionType *OFT = OF->getFunctionType(); + FunctionType *EFT; + std::queue SValues; + + // If the function accesses Global Variables we need to adjust the + // function interface. We append an argument for every Global Variable + // the function accesses. The argument should have the same type + // as the Global Variable. 
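+// Sketch of the interface adjustment (illustrative names): a workload
+//
+//   @Coeffs = global [16 x float] ...
+//   define float @filter(float* %in, i32 %n)   ; reads @Coeffs
+//
+// is cloned into the Hexe module as
+//
+//   define float @filter(float* %in, i32 %n, [16 x float]* %Coeffs.arg)
+//
+// and every use of @Coeffs inside the clone is remapped to the appended
+// argument, while the original host function is left untouched.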
+ if (WI->getGlobalMemRefs().size()) { + SmallVector ArgTypes; + for (auto I = OFT->param_begin(), IE = OFT->param_end(); I!= IE; ++I) + ArgTypes.push_back(*I); + + WorkloadMemRefSet GV = WI->getGlobalMemRefs(); + for (auto I = GV.begin(), IE = GV.end(); I!= IE; ++I){ + ArgTypes.push_back( I->Ptr->getType() ); + SValues.push(I->Ptr); + } + + EFT = FunctionType::get( OFT->getReturnType(), ArgTypes, false); + } else + EFT = OFT; + + // We add a new Function Prototype on Hexe Workload. + Function *EF; + std::tie(EF, std::ignore, std::ignore) = HW->addFunctionPrototype(OF, EFT); + + // We clone the function body. We update all the Global Variable references + // to point to the newly added arguments. + ValueToValueMapTy VMap; + Function::ArgumentListType &OFArgs = OF->getArgumentList(); + Function::ArgumentListType &EFArgs = EF->getArgumentList(); + + auto J = EFArgs.begin(); + for (auto I = OFArgs.begin(), IE = OFArgs.end(); I!= IE; ++I, ++J) + VMap.insert( std::make_pair( I, WeakVH(J) ) ); + + for (auto JE = EFArgs.end(); J!= JE; ++J) { + VMap.insert( std::make_pair(SValues.front(), WeakVH(J)) ); + SValues.pop(); + } + + SmallVector Returns; + CloneFunctionInto(EF, OF, VMap, false, Returns, ""); + + return EF; +} + +// Marshalling of function call arguments and global variables +// +// Each time we need to dispatch a workload for computation on the +// accelerator we need to "pack" the call arguments and global +// variable references on a buffer. The reason for doing that +// is that the runtime library function calls that perform the +// offloading provide a generic interface, which is agnostic +// of the particular Workload. At this step we treat pointer +// arguments as scalars and we simply copy their values. The +// runtime is responsible to convert them to valid values in +// the execution context of the accelerator. +// We represent the dispatch data in the following format in +// the buffer: +// ___________________ +// | Call Arg 0 | +// | Call Arg 1 | +// | Call Arg 2 | +// | ... | +// | Global Var Ref 0 | +// | Global Var Ref 1 | +// | .... | +// | Return Value | +// |___________________| + +/// \brief this function marshals the Function Call Arguments +/// and the Global Variable references of the function call +/// on a buffer. At this point we treat pointer values as +/// scalars where just copy the value. This operation is +/// performed before dispatching a workload execution on the +/// accelerator. +static void prologMarshalling(Instruction *II, Value *StructPtr, + StructType *ST, WorkloadInfo *WI, + CallInst *OCI, LLVMContext &C) +{ + //lambda function for writing a particular argument on the buffer + ConstantInt *ZeroIndex = ConstantInt::get(Type::getInt32Ty(C), 0); + auto gepstore = [ &StructPtr, &C, &II, &ST, &ZeroIndex ] + ( unsigned Indx, Value *In) { + Value *indices[] = { ZeroIndex, + ConstantInt::get(Type::getInt32Ty(C), Indx) }; + auto P = + GetElementPtrInst::CreateInBounds(StructPtr, + ArrayRef(indices, 2), "", II); + return new StoreInst(In, P, II); + }; + + unsigned Indx = 0; + + //we marshal the original arguments + for (unsigned I = 0, IE = OCI->getNumArgOperands(); I!= IE; ++I) + gepstore(Indx++, OCI->getArgOperand(I)); + + //We append the Global Variable references + for (auto I = WI->getGlobalMemRefs().begin(), + IE = WI->getGlobalMemRefs().end(); I!= IE; ++I) + gepstore(Indx++, const_cast(I->Ptr)); +} + +/// \brief This function is the complement of prologMarshalling. 
It +/// is called after the completion of a workload dispatch and +/// reads the return values back from the buffer. +static Instruction *epilogueMarshalling(Instruction *II, Value *StructPtr, + StructType *ST, WorkloadInfo *WI, + CallInst *OCI, LLVMContext &C) +{ + //lambda function for reading a value from the buffer + ConstantInt *ZeroIndex = ConstantInt::get(Type::getInt32Ty(C), 0); + auto gepload = [ &StructPtr, &C, &II, &ST, &ZeroIndex ] + ( unsigned Indx) { + Value *indices[] = { ZeroIndex, + ConstantInt::get(Type::getInt32Ty(C), Indx) }; + auto P = + GetElementPtrInst::CreateInBounds(StructPtr, + ArrayRef(indices, 2), "", II); + return new LoadInst(P, "", II); + }; + + //if the function type returns a value we read it from the buffer. + FunctionType *FT = OCI->getCalledFunction()->getFunctionType(); + if (FT->getReturnType()->isVoidTy()) + return nullptr; + + return gepload( ST->getNumElements() -1 ); +} + + + +// Passing Memory Access Information to the runtime library. +// +// We pass information about the memory accesses of the Workload +// to the runtime. The runtime may use this information for a number +// of operations such as enforce coherency, perform runtime checks +// or data transfers. +// +// The following struct type provides information for a single memory +// access. +// struct MemAccessInfo{ +// void *ptr; //a pointer to the accessed memory segment. +// unsigned size; //a static analysis estimation about the segment size +// unsigned arg_order; // the argument of the Hexe Function (the cloned, +// // extracted function version) that gets mapped to +// // this segment. +// access_mode; // the access mode, read 0, write 1, readwrite 3; +// }; +// We provide an array of this struct to the runtime. Each entry represents +// a different memory segment. +// +// + + +/// \brief It allocates the required memory for the Memory Access Info Array +/// on the stack. +static Value *MemAccessInfoBufferAlloc(Instruction *II, WorkloadInfo *WI, + LLVMContext &C, + StructType *MemoryAccessInfoT) +{ + size_t N = WI->getInterfaceMemRefs().size() + WI->getGlobalMemRefs().size(); + + if (!N) + return ConstantPointerNull::get(PointerType::getUnqual(MemoryAccessInfoT)); + + Value *NV = ConstantInt::get(Type::getInt32Ty(C), N); + return new AllocaInst(MemoryAccessInfoT, NV, "", II); +} + +/// \brief It writes the data of a particular Memory Access Info Array element. +static Instruction *writeMemAccessInfo(Instruction *II, LLVMContext &C, + Value *StructArP, Value *MemPtr, + unsigned MemSize, unsigned ArrayIndex, + unsigned ArgOrder, + MemAccessInfoAccessMode Mode) +{ + // lambda function that writes a single field of the MemAccessInfo + // struct at a time. 
+ auto gepstore = [ &StructArP, &C, &II ] + ( unsigned Indx0, unsigned Indx1, Value *In) { + Value *indices[] = { + ConstantInt::get(Type::getInt32Ty(C), Indx0), + ConstantInt::get(Type::getInt32Ty(C), Indx1) + }; + auto P = + GetElementPtrInst::CreateInBounds(StructArP, + ArrayRef(indices, 2), "", II); + return new StoreInst(In, P, II); + }; + + //set ptr + Type *VoidPtr = PointerType::getUnqual( Type::getInt8Ty(C) ); + Value *PtrC = new BitCastInst(MemPtr, VoidPtr, "", II); + gepstore(ArrayIndex, 0, PtrC); + //set size + Value *MemSizeV = ConstantInt::get(Type::getInt32Ty(C), MemSize); + gepstore(ArrayIndex, 1, MemSizeV); + //set arg_order + Value *ArgOrderV = ConstantInt::get(Type::getInt32Ty(C), ArgOrder); + gepstore(ArrayIndex, 2, ArgOrderV); + //set access mode + Value *AccessModeV = ConstantInt::get(Type::getInt8Ty(C), + static_cast(Mode) ); + return gepstore(ArrayIndex, 3, AccessModeV); +} + +/// \brief It allocates and sets the Memory Access Info Array +static Value* +allocateAndSetMemAccessInfoArray(Instruction *II, CallInst *CI, + WorkloadInfo *WI, LLVMContext &C, + StructType *MemoryAccessInfoT) +{ + //array allocation + Value *StructArray = MemAccessInfoBufferAlloc(II, WI, C, MemoryAccessInfoT); + + if (!( WI->getInterfaceMemRefs().size() + WI->getGlobalMemRefs().size())) + return StructArray; + + unsigned ArgOrder = 0; + unsigned ArrayIndex = 0; + + // pass information for memory segments provided by the original function + // interface + auto &FMRefs = WI->getInterfaceMemRefs(); + for (auto I = WI->getFunction()->getArgumentList().begin(), + IE = WI->getFunction()->getArgumentList().end(); I!= IE; ++I) { + WorkloadMemRef Query; + Query.Ptr = I; + auto MR = FMRefs.find(Query); + if ( MR!= FMRefs.end() ){ + writeMemAccessInfo(II, C, StructArray, + CI->getArgOperand(ArgOrder), + MR->Size, ArrayIndex++, + ArgOrder, MemAccessInfoAccessMode::ReadWrite); + } + ++ArgOrder; + } + + //pass information for global memory segments. + for (auto I = WI->getGlobalMemRefs().begin(), + IE = WI->getGlobalMemRefs().end(); I!= IE; ++I) { + writeMemAccessInfo(II, C, StructArray, + const_cast(I->Ptr), I->Size, ArrayIndex++, + ArgOrder++, MemAccessInfoAccessMode::ReadWrite); + } + + return StructArray; +} + +/// \brief This functions generates the code that performs th Workload +/// offloading. 
+/// +/// It does the following: +/// a) Allocates and sets the Memory Access Info Array +/// b) Enforces coherency by calling __hexe_enforce_coherency +/// c) Marshals the call arguments +/// d) Performs the offloading by calling __hexe_dispatch +/// e) Waits for its completion by calling __hexe_event_wait +/// f) Enforces coherency by calling __hexe_enforce_coherency +/// g) Reads the return value if any +Instruction * +WorkloadExtractUtil::marshalAndOffload(Instruction *II, CallInst *CI, + WorkloadInfo *WI) +{ + unsigned FID; + StructType *ST; + LLVMContext &C = HM->getContext(); + const DataLayout &DL = HM->getDataLayout(); + std::tie(std::ignore, FID, ST) = *HW->getMapping(CI->getCalledFunction()); + + Value *FIDV = ConstantInt::get(Type::getInt32Ty(C), FID); + + // a) Allocates and sets the Memory Access Info Array + Value *MemAccessInfo = + allocateAndSetMemAccessInfoArray(II, CI, WI, C, MemoryAccessInfoT); + + // b) Enforces coherency by calling __hexe_enforce_coherency + Value *MemAccNV = ConstantInt::get(Type::getInt32Ty(C), + WI->getInterfaceMemRefs().size() + + WI->getGlobalMemRefs().size()); + Value *CohArgs[] = { FIDV, MemAccessInfo, MemAccNV }; + CallInst::Create(HexeCoherencyCall, ArrayRef(CohArgs, 3), "", II); + + // c) Marshals the call arguments + Type *VoidPtr = PointerType::getUnqual( Type::getInt8Ty(C) ); + AllocaInst *SA = new AllocaInst(ST, "offload_data", II); + prologMarshalling(II, SA, ST, WI, CI, C); + + // d) Performs the offloading by calling __hexe_dispatch + Value *CArgsP = new BitCastInst(SA, VoidPtr, "", II); + Value *CArgsSize = ConstantInt::get(Type::getInt32Ty(C), + DL.getTypeAllocSize(ST) ); + Value *Args[] = { FIDV, CArgsP, CArgsSize }; + + Value *Event = CallInst::Create(HexeDispatchCall, ArrayRef(Args, 3), + "hexe_offload", II); + + // e) Waits for its completion by calling __hexe_event_wait + CallInst::Create(HexeEventWaitCall, ArrayRef(Event), + "hexe_wait", II); + + // f) Enforces coherency by calling __hexe_enforce_coherency + CallInst::Create(HexeCoherencyCall, ArrayRef(CohArgs, 3), + "", II); + + // g) Reads the return value if any + return epilogueMarshalling(II, SA, ST, WI, CI, C); +} + +/// \brief It inserts a runtime call to __hexe_runtime_sched +/// function which controls at runtime if the Workload will +/// be executed on the CPU or the accelerator. +/// +/// Future plans (TODO): support runtime checks via this call. +/// We should provide access to the Memory Access Info Array +/// and Kernel Code Description. 
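+// At the call site this roughly materializes as (types and labels
+// abbreviated; value names illustrative):
+//
+//   %d  = call i32 @__hexe_runtime_sched(i32 <function_id>,
+//                     %hexe_mem_access_info_t* null, i32 0,
+//                     %hexe_kernel_info__t* null)
+//   %go = trunc i32 %d to i1                  ; hexe_sched_decision_cast
+//
+// and the enclosing transformation then branches on %go to select between
+// the CPU path and the offloading path.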
+Value *WorkloadExtractUtil::injectRuntimeSched(Instruction *II, + CallInst *CI, WorkloadInfo *WI) +{ + unsigned FID; + LLVMContext &C = HM->getContext(); + std::tie(std::ignore, FID, std::ignore) = *HW->getMapping( + CI->getCalledFunction()); + + //void __hexe_runtime_sched(unsigned id, MemoryAccessInfo *, + //unsigned num_elems, hexe_kernel_info__t *); + + Value *FIDV = ConstantInt::get(Type::getInt32Ty(C), FID); + Value *MemAccessP = + ConstantPointerNull::get(PointerType::getUnqual(MemoryAccessInfoT)); + Value *MemAccessNumV = ConstantInt::get(Type::getInt32Ty(C), 0); + Value *KernelInfoP = + ConstantPointerNull::get(PointerType::getUnqual(HexeKernelInfoT)); + + Value *Args[] = { FIDV, MemAccessP, MemAccessNumV, KernelInfoP }; + Value *D = CallInst::Create(HexeRuntimeSchedCall, ArrayRef(Args, 4), + "hexe_sched_decision", II); + + return CastInst::CreateIntegerCast(D, Type::getInt1Ty(C), false, + "hexe_sched_decision_cast", II); +} + +/// \brief This functions performs transformations on the Basic Block +/// that contains the function call we decided to offload. It also +/// injects the required Hexe runtime library Calls. +/// +/// We split the Basic Block at the function call site and we insert +/// a runtime scheduling call. Based on the value it returns, the +/// Workload will either be executed on the CPU or the accelerator. +/// We create two new Basic Blocks. The one performs the execution +/// on the CPU, the other performs the necessary actions in order +/// to offload to the accelerator. Finally we have a Converge Block +/// with a Phi Node for the function call return values. After the +/// PhiNode instructions we append the original instructions. +/// +/// Transformation Overview +/// BB before Fun Call +/// +------------------+ +/// | Instruction 0 | +/// | Instruction 1 | +/// | Instruction 2 | +/// Original Basic Block | Instruction 3 | +/// +------------------+ | | +/// | Instruction 0 | |Call Runtime_Sched| +/// | Instruction 1 | |SChed Branch | +/// | Instruction 2 | +----+------+------+ +/// | Instruction 3 | CPU Execution | | Offloading +/// | | Basic Block | | Basic Block +/// | .............. | |-------+ +------| +/// | | +----> +------------------+ +------------------+ +/// | CallInst @Func | | CallInst @Func | | MemAccessInfo set| +/// | | +------+-----------+ | Enforce Coherency| +/// | .............. | | | Call Data Marsh. | +/// | Instruction N | | | Dispatch Workload| +/// | Instruction N+1 | | | Wait Completion | +/// | .............. | | | Enforce Coherency| +/// | | | | Read Return Value| +/// | | | +-------------+----+ +/// | | | | +/// +------------------+ | | +/// | | +/// +----+Merge Basic Block +---+ +/// +------------------+ +/// |PhiNode(Ret. Val) | +/// | .............. | +/// | Instruction N | +/// | Instruction N+1 | +/// | .............. 
| +/// +------------------+ +/// +bool WorkloadExtractUtil::transformAndInjectGlueCode(WorkloadInfo *WI) +{ + TerminatorInst *ThenTerm; + TerminatorInst *ElseTerm; + CallInst *OCI = WI->getCall(); + + //insert runtime scheduling call + Value *SchedCond = injectRuntimeSched(OCI, OCI, WI); + + //split the Basic Block + SplitBlockAndInsertIfThenElse(SchedCond, OCI, &ThenTerm, &ElseTerm); + + //insert Code for offloading + Instruction *CallThen = marshalAndOffload(ThenTerm, OCI, WI); + + //insert Code for cpu execution + Instruction *CallElse = OCI->clone(); + CallElse->insertBefore(ElseTerm); + + //insert Phi Node for the return value + FunctionType *FT = OCI->getCalledFunction()->getFunctionType(); + if (!FT->getReturnType()->isVoidTy()) { + Instruction *InPos = OCI->getParent()->getFirstNonPHI(); + Type *PHIType = OCI->getCalledFunction()->getReturnType(); + + PHINode *PN = PHINode::Create(PHIType, 2, "hexe_converge", InPos); + PN->addIncoming(CallThen, ThenTerm->getParent()); + PN->addIncoming(CallElse, ElseTerm->getParent()); + + OCI->replaceAllUsesWith(PN); + } + + OCI->eraseFromParent(); + return true; +} + +/// \brief Annotates the Global Variable, Heap and Stack allocations +/// used by the Workload for replacement with equivalents that use +/// the Hexe library functions for memory allocation. +void WorkloadExtractUtil::annotateMemAllocationsForReplacement( + WorkloadInfo *WI) +{ + GlobalAllocations.insert(WI->getGlobalAllocations().begin(), + WI->getGlobalAllocations().end()); + MallocAllocations.insert(WI->getMallocAllocations().begin(), + WI->getMallocAllocations().end()); + AllocaAllocations.insert(WI->getAllocaAllocations().begin(), + WI->getAllocaAllocations().end()); +} + +bool WorkloadExtractUtil::extractWorkloadCode(WorkloadInfo *WI) +{ + WorkloadInfo *W2E; + WorkloadInfo Tmp; + DominatorTreeWrapperPass &DTW = + HP->getAnalysis(*(WI->getFunction())); + + DominatorTree *DT = &DTW.getDomTree(); + WA->setForOffloading(WI); + + //If the Workload is a Loop we need to transform it to a function + //first. + if (WI->isLoop()) { + loopToFunction(WI, &Tmp, DT); + W2E = &Tmp; + } else + W2E = WI; + + //annotate the Memory Allocations used by the Workload + //for replacement. + annotateMemAllocationsForReplacement(W2E); + + //Clone the function on the Hexe Workload Module. + cloneOrGetHexeFunction(W2E); + + //transform control flow and inject runtime calls. + transformAndInjectGlueCode(W2E); + + return true; +} + +/// \brief it replace a malloc function call with a call to hexe_malloc, +/// the malloc function equivalent of the Hexe runtime library. +static void replaceMalloc(CallInst *M, Function *HexeMalloc, Module *HM) +{ + LLVMContext &C = HM->getContext(); + //0 alignment set for now, + //meaning the implementation uses the + //standard alignment. + Value *Al = ConstantInt::get(Type::getInt32Ty(C), 0); + Value *Args[] = { M->getArgOperand(0), Al }; + CallInst *HMC = + CallInst::Create(HexeMalloc, ArrayRef(Args, 2), "", M); + + M->replaceAllUsesWith(HMC); + M->eraseFromParent(); +} + +/// \brief it replaces a free function call with a call to hexe_free, +/// the free function equivalent of the Hexe runtime library. 
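+// On the offloading path the glue code emitted above boils down to the
+// following shape (hedged sketch, value names illustrative):
+//
+//   call void @__hexe_enforce_coherency(i32 <id>, %hexe_mem_access_info_t* %mai, i32 <n>)
+//   ; marshal the call arguments and referenced globals into %offload_data
+//   %ev = call %hexe_event_t* @__hexe_dispatch(i32 <id>, i8* %args, i32 <size>)
+//   %rc = call i32 @__hexe_event_wait(%hexe_event_t* %ev)
+//   call void @__hexe_enforce_coherency(i32 <id>, %hexe_mem_access_info_t* %mai, i32 <n>)
+//   ; read the return value, if any, back out of %offload_data
+//
+// while the CPU path keeps a clone of the original call; both paths meet in
+// the PHI node that forwards the return value to the original users.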
+static void replaceFree(CallInst *F, Function *HexeFree, Module *HM) +{ + CallInst *HFC = + CallInst::Create(HexeFree, + ArrayRef(F->getArgOperand(0)), "", F); + + F->replaceAllUsesWith(HFC); + F->eraseFromParent(); +} + + +// >> Replace Global Variable +// +// key Goal: Replace Global Variables with equivalents that use +// memory provided by the Hexe runtime facilities instead of using +// the data segment. Make the minimal changes. +// +// How LLVM handles Global Variables: +// A Global Variable of type T is handled as Pointer T (T *) value. +// Whenever a function needs to access the value it performs load +// and store operations. +// +// The replacement strategy: +// +// For every Global Variable: +// +// a) We replace its type from T to T*, then this variable is handled +// as T ** value. +// +// b) We insert a load instruction for every user of the variable so +// the user receives a T * value as before. +// +// c) We generate Constructor and Destructor functions that allocate +// memory for the Global Variable via the Hexe memory management +// functions. +// + +// It keeps track of the Old and New Global Variables so +// we can build the constructor and destructor. +typedef std::tuple + CtorDctorT; + + +/// \brief +/// It transforms all the uses of a Constant Expression +/// For every ConstantExpr use it creates an actual instruction +/// that perform the exact operation. It insert a new +/// instruction just before every user of the ConstantExpr. +/// It updates GUsers by inserting those new instructions. +static void handleConstantExpr(ConstantExpr *CE, + std::queue &GUsers) +{ + std::set Users; + for (auto I = CE->use_begin(), IE = CE->use_end(); I!= IE; ++I) + Users.insert(I->getUser()); + + for (auto I = Users.begin(), IE = Users.end(); I!= IE; ++I) { + Instruction *Inst = dyn_cast(*I); + Instruction *CInst = CE->getAsInstruction(); + CInst->insertBefore(Inst); + Inst->replaceUsesOfWith(CE, CInst); + GUsers.push(CInst); + } +} + +/// \brief This function replaces a Global Variable of Type +/// T with one of Type T *. Then for every user of the original +/// variables a new load instruction is inserted so the user +/// uses again a Global Variable of Type T. +static void replaceGlobalVars(GlobalVariable *GV, Module *M, + std::vector &C) +{ + //insert new variable + PointerType *GVT = GV->getType(); + GlobalVariable *NGV = + new GlobalVariable(*M, GVT, GV->isConstant(), + GV->getLinkage(), ConstantPointerNull::get(GVT), + GV->getName()+"_hexe"); + NGV->copyAttributesFrom(GV); + + //handle ConstantExprs and update the Global Variable uses. + std::queue Users; + for (auto I = GV->use_begin(), IE = GV->use_end(); I!= IE; ++I) + Users.push(I->getUser()); + + while (Users.size()) { + Value *User = Users.front(); + Users.pop(); + Instruction *Inst; + + if (isa(User)) { + handleConstantExpr(dyn_cast(User), Users); + continue; + } else + Inst = dyn_cast(User); + + //insert the new load instruction + LoadInst *LI = new LoadInst(NGV, "", Inst); + Inst->replaceUsesOfWith(GV, LI); + } + + //keep track of the New and Old Global Variables + //so we can build the constructor and destructor later. + C.push_back( std::make_tuple(NGV, GV->getType(), GV) ); +} + +/// \brief This function builds a constructor for all the Global Variables +/// we replace. It allocates memory via the Hexe library memory management +/// functions and also initializes that memory based on the content +// of the original Global Variables. 
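+// A hedged before/after sketch of the replacement (names illustrative):
+//
+//   before:  @state      = global [64 x i32] zeroinitializer
+//            ... loads and stores through @state ...
+//
+//   after:   @state_hexe = global [64 x i32]* null
+//            %p = load [64 x i32]*, [64 x i32]** @state_hexe   ; per user
+//            ... the former users of @state now go through %p ...
+//
+// hexe.constructor (registered as a global ctor below) obtains the backing
+// storage from __hexe_malloc, copies the original global's initial data into
+// it with llvm.memcpy and stores the pointer into @state_hexe;
+// hexe.destructor releases the storage through __hexe_free.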
+static void createGlobalConstructor(const std::vector &G, + Module *M, Function *HexeMalloc) +{ + LLVMContext &C = M->getContext(); + const DataLayout &DL = M->getDataLayout(); + Type *Void = Type::getVoidTy(C); + Type *VoidPtr = PointerType::getUnqual( Type::getInt8Ty(C) ); + Type *Int32 = Type::getInt32Ty(C); + Type *Int1 = Type::getInt1Ty(C); + + //get memcpy intrinsic declaration + Type *MemCpyType[] = { VoidPtr, VoidPtr, Int32 }; + Function *Mcopy = + Intrinsic::getDeclaration(M, Intrinsic::memcpy, + ArrayRef(MemCpyType, 3) ); + + //build constructor interface + FunctionType *FT = FunctionType::get(Void, false); + Function *F = + Function::Create(FT, GlobalValue::ExternalLinkage, + "hexe.constructor", M); + F->setCallingConv(CallingConv::C); + + BasicBlock *BB = BasicBlock::Create(C, "", F); + + for (auto I = G.begin(), IE = G.end(); I!= IE; ++I) { + GlobalVariable *GV; + Type *AT; + GlobalValue *In; + std::tie(GV, AT, In) = *I; + + //allocate memory with hexe_malloc + Value *Size = ConstantInt::get(Type::getInt32Ty(C), DL.getTypeAllocSize(AT)); + Value *Al = ConstantInt::get(Type::getInt32Ty(C), 0); + Value *Args[] = { Size, Al }; + CallInst *HMC = CallInst::Create(HexeMalloc, ArrayRef(Args, 2), + "", BB); + + //initialize by copying the original Global Variable content + Instruction *IC = new BitCastInst(In, VoidPtr, "", BB); + Value *Margs[] = { HMC, IC, + ConstantInt::get(Int32, DL.getTypeAllocSize(AT)), + ConstantInt::get(Int32, DL.getABITypeAlignment(AT)), + ConstantInt::get(Int1, 0) + }; + CallInst::Create(Mcopy, ArrayRef(Margs, 5), "", BB); + Instruction *TC = + new BitCastInst(HMC, GV->getType()->getPointerElementType(), "", BB); + new StoreInst(TC, GV, BB); + } + + ReturnInst::Create(C, BB); + + appendToGlobalCtors(*M, F, 65535); +} + +/// \brief This function builds a destructor for all the Global Variables +/// we replace. It releases their memory via the Hexe library memory +/// management functions. +static void createGlobalDestructor(const std::vector &G, + Module *M, Function *HexeFree) +{ + LLVMContext &C = M->getContext(); + Type *Void = Type::getVoidTy(C); + Type *VoidPtr = PointerType::getUnqual(Type::getInt8Ty(C)); + + //build destructor interface + FunctionType *FT = FunctionType::get(Void, false); + Function *F = Function::Create(FT, GlobalValue::ExternalLinkage, + "hexe.destructor", M); + F->setCallingConv(CallingConv::C); + + BasicBlock *BB = BasicBlock::Create(C, "", F); + + for (auto I = G.begin(), IE = G.end(); I!= IE; ++I) { + GlobalVariable *GV; + std::tie(GV, std::ignore, std::ignore) = *I; + + //calls hexe_free + Instruction *LI = new LoadInst(GV, "", BB); + Instruction *TC = CastInst::CreatePointerCast(LI, VoidPtr, "", BB); + CallInst::Create(HexeFree, ArrayRef(TC), "", BB); + } + + ReturnInst::Create(C, BB); + + appendToGlobalDtors(*M, F, 65535); +} + +/// \brief It detects the Basic Blocks of a Function +/// F that contain return instructions. +static void getFunctionReturnBlocks(Function *F, + std::set &RBlocks) +{ + for (auto I = F->begin(), IE = F->end(); I!= IE; ++I) { + if (!isa(I->getTerminator())) + continue; + RBlocks.insert(I); + } +} + +/// \brief +/// It gets the Descendant Basic Blocks of a Block by +/// traversing the CFG. 
The Descendant Blocks are stored +/// in DBlocks +static void getBasicBlockDescendants(BasicBlock *BB, + std::set &DBlocks) +{ + for (auto I : successors(BB)) { + DBlocks.insert(I); + getBasicBlockDescendants(I, DBlocks); + } +} + + +/// \brief This function replaces stack allocations +/// with Hexe memory allocation calls. +/// +/// We replace alloca instruction with hexe_malloc +/// calls. The tricky part here is that by replacing +/// a stack allocation with hexe_malloc we have to +/// explicitly release the memory when it is not in +/// use any more. +/// We get all the Basic Blocks of the function that have +/// a return terminator instruction. We then investigate +/// which of them are reachable from the Basic Block +/// where we replaced the alloca with hexe_malloc. +/// We insert a hexe_free in every return Basic Block +/// that is reachable. +static void replaceAlloca(AllocaInst *AI, Module *M, + Function *HexeMalloc, + Function *HexeFree) +{ + LLVMContext &C = M->getContext(); + const DataLayout &DL = M->getDataLayout(); + Type *Int32 = Type::getInt32Ty(C); + Function *HostF = AI->getParent()->getParent(); + BasicBlock *HostBB = AI->getParent(); + + //a) replace with hexe malloc + Value *Size = + ConstantInt::get(Int32, DL.getTypeAllocSize(AI->getAllocatedType())); + if (!Size->getType()->isIntegerTy(32)) + Size = CastInst::CreateIntegerCast(Size, Int32, false, "", AI); + + // get the allocation size + if (AI->isArrayAllocation()) { + Value *AElems = AI->getArraySize(); + if (!AElems->getType()->isIntegerTy(32)) + AElems = CastInst::CreateIntegerCast(AElems, Int32, false, "", AI); + Size = BinaryOperator::Create(Instruction::Mul, Size, AElems, "", AI); + } + + Value *Al = ConstantInt::get(Type::getInt32Ty(C), 0); + Value *Args[] = { Size, Al }; + CallInst *HMC = + CallInst::Create(HexeMalloc, ArrayRef(Args, 2), "", AI); + + Value *CMC; + if (AI->getType()!= HMC->getType()) + CMC = CastInst::CreatePointerCast(HMC, AI->getType(), "", AI); + else + CMC = HMC; + + //b) replace uses + AI->replaceAllUsesWith(CMC); + AI->eraseFromParent(); + + //c) insert hexe_free calls + //we replace allocas with hexe_mallocs which require + //calling hexe_free to release the allocated memory + + + //We get all the Basic Blocks of the function that have + //a return terminator instruction. We then investigate + //which of them are reachable from the Basic Block + //where we replaced the alloca with hexe_malloc. + //We insert a hexe_free in every return Basic Block + //that is reachable. 
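+// Hedged sketch of the rewrite (value names illustrative):
+//
+//   before:  %buf = alloca [1024 x i8]
+//            ...
+//            ret void
+//
+//   after:   %m   = call i8* @__hexe_malloc(i32 1024, i32 0)
+//            %buf = bitcast i8* %m to [1024 x i8]*
+//            ...
+//            call void @__hexe_free(i8* %m)   ; in every reachable return block
+//            ret void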
+ std::set RBlocks, DBlocks; + getFunctionReturnBlocks(HostF, RBlocks); + getBasicBlockDescendants(HostBB, DBlocks); + + for (auto I = RBlocks.begin(), IE = RBlocks.end(); I!= IE; ++I) { + if (DBlocks.count(*I)) + CallInst::Create(HexeFree, ArrayRef(HMC), + "", (*I)->getTerminator()); + } +} + +bool WorkloadExtractUtil::replaceMemAllocations() +{ + //replace malloc calls + for (auto I = MallocAllocations.begin(), + IE = MallocAllocations.end(); I!= IE; ++I) + replaceMalloc(*I, HexeMalloc, HM); + + //replace global variables + std::vector C; + for (auto I = GlobalAllocations.begin(), + IE = GlobalAllocations.end(); I!= IE; ++I) + replaceGlobalVars(*I, HM, C); + createGlobalConstructor(C, HM, HexeMalloc); + createGlobalDestructor(C, HM, HexeFree); + + //replace allocas + for (auto I = AllocaAllocations.begin(), IE = AllocaAllocations.end(); + I!= IE; ++I) + replaceAlloca(*I, HM, HexeMalloc, HexeFree); + + //replace all free calls + //Due to the limitations of alias analysis and pointer escaping, + //it is unfeasible to track the free call that releases a specific + //malloc allocation. For that reason we replace all the free calls + //with calls to hexe_free function. We then resolve at runtime if + //an allocation was served by the standard malloc or hexe_malloc. + std::vector FreeCalls; + TargetLibraryInfo *TLI = WA->getTLI(); + //function iterator + for (auto I = HM->begin(), IE = HM->end(); I!= IE; ++I) + //BasicBlock iterator + for (auto J = I->begin(), JE = I->end(); J!= JE; ++J) + // Instruction Iterator + for (auto K = J->begin(), KE = J->end(); K!= KE; ++K) + if (isFreeCall(K, TLI)) + FreeCalls.push_back(dyn_cast(K)); + + + for (auto I = FreeCalls.begin(), IE = FreeCalls.end(); I!= IE; ++I) + replaceFree(dyn_cast(*I), HexeFree, HM); + + return true; +} + +/// \brief This utility function either gets or inserts +/// a function declaration to a module. +static void insertOrGetFunctionGValue(StringRef FName, FunctionType *FT, + Module *M, Function * &F ) +{ + F = M->getFunction(FName); + if (!F) { + F = Function::Create(FT, GlobalValue::ExternalLinkage, FName, M); + F->setCallingConv(CallingConv::C); + } +} + + +/// \brief It adds the function declarations of the Hexe runtime +/// interface to the Module. It also defines the struct data types +/// used by these functions. 
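/*
 * Illustrative sketch only -- not part of this patch. A C-level picture of
 * what the rewriting above amounts to for one function. The runtime entry
 * points mirror the __hexe_malloc/__hexe_free prototypes declared in
 * setHexeRTFunctions() below; the stand-in definitions and the function
 * rewritten_example() are made up for the example.
 */
#include <stdlib.h>

static void *__hexe_malloc(unsigned size, unsigned alignment) {
  (void)alignment;              /* stand-in: the real runtime honours it */
  return malloc(size);
}

static void __hexe_free(void *ptr) { free(ptr); }

/* Before the pass, an alloca (a local array) and a malloc feed an offloaded
 * workload, and free() releases the heap block. After the pass, both
 * allocations come from the Hexe runtime, every free() is redirected to
 * __hexe_free, and an extra __hexe_free for the former alloca is inserted
 * before each reachable return (see replaceAlloca above). */
static int rewritten_example(void) {
  int *buf = (int *)__hexe_malloc(64 * sizeof(int), 0); /* was: int buf[64] */
  int *p = (int *)__hexe_malloc(sizeof(int), 0);        /* was: malloc(4)   */
  buf[0] = *p = 0;               /* placeholder for the offloaded workload  */
  __hexe_free(p);                /* was: free(p)                            */
  __hexe_free(buf);              /* inserted before the return              */
  return 0;
}

int main(void) { return rewritten_example(); }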
+
+/// \brief It adds the function declarations of the Hexe runtime
+/// interface to the Module. It also defines the struct data types
+/// used by these functions.
+void WorkloadExtractUtil::setHexeRTFunctions()
+{
+  LLVMContext &C = HM->getContext();
+
+  StructType *EventT = StructType::create(C, "hexe_event_t");
+  PointerType *EventTP = PointerType::getUnqual(EventT);
+  Type *VoidPtr = PointerType::getUnqual(Type::getInt8Ty(C));
+  Type *Int32 = Type::getInt32Ty(C);
+  Type *Void = Type::getVoidTy(C);
+
+  //struct MemAccessInfo {
+  //  void *ptr;
+  //  unsigned size;
+  //  unsigned arg_order;
+  //  char access_mode; // read 0, write 1, readwrite 3
+  //};
+  Type *MemAccessInfoFields[] = {
+    VoidPtr, Int32, Int32,
+    Type::getInt8Ty(C)
+  };
+  StructType *MAST =
+    StructType::get(C, ArrayRef<Type *>(MemAccessInfoFields, 4),
+                    "hexe_mem_access_info_t");
+  Type *MASTP = PointerType::getUnqual(MAST);
+
+  //Hexe Kernel Info Type
+  StructType *KernelInfoT = StructType::create(C, "hexe_kernel_info__t");
+  PointerType *KernelInfoTP = PointerType::getUnqual(KernelInfoT);
+
+  //int __hexe_runtime_sched(unsigned function_id, MemoryAccessInfo *,
+  //                         unsigned num_elems, hexe_kernel_info__t *);
+  Type *HRSArgs[] = { Int32, MASTP, Int32, KernelInfoTP };
+  FunctionType *HRSFT =
+    FunctionType::get(Int32, ArrayRef<Type *>(HRSArgs, 4), false);
+  insertOrGetFunctionGValue("__hexe_runtime_sched", HRSFT,
+                            HM, HexeRuntimeSchedCall);
+
+  //hexe_event_t *__hexe_dispatch(int function_id, void *args, int args_size);
+  Type *HDFArgs[] = { Int32, VoidPtr, Int32 };
+  FunctionType *HDFT =
+    FunctionType::get(EventTP, ArrayRef<Type *>(HDFArgs, 3), false);
+  insertOrGetFunctionGValue("__hexe_dispatch", HDFT, HM, HexeDispatchCall);
+
+  //void __hexe_enforce_coherency(int function_id, MemoryAccessInfo *,
+  //                              unsigned num_elems);
+  Type *HECArgs[] = { Int32, MASTP, Int32 };
+  FunctionType *HECFT =
+    FunctionType::get(Void, ArrayRef<Type *>(HECArgs, 3), false);
+  insertOrGetFunctionGValue("__hexe_enforce_coherency", HECFT,
+                            HM, HexeCoherencyCall);
+
+  //int __hexe_event_wait(hexe_event_t *event);
+  Type *HHWArgs[] = { EventTP };
+  FunctionType *HHWFT =
+    FunctionType::get(Int32, ArrayRef<Type *>(HHWArgs, 1), false);
+  insertOrGetFunctionGValue("__hexe_event_wait", HHWFT,
+                            HM, HexeEventWaitCall);
+
+  //void *__hexe_malloc(unsigned size, unsigned alignment);
+  Type *HMArgs[] = { Int32, Int32 };
+  FunctionType *HMFT =
+    FunctionType::get(VoidPtr, ArrayRef<Type *>(HMArgs, 2), false);
+  insertOrGetFunctionGValue("__hexe_malloc", HMFT, HM, HexeMalloc);
+
+  //void __hexe_free(void *ptr);
+  Type *HFArgs[] = { VoidPtr };
+  FunctionType *HFFT =
+    FunctionType::get(Void, ArrayRef<Type *>(HFArgs, 1), false);
+  insertOrGetFunctionGValue("__hexe_free", HFFT, HM, HexeFree);
+
+  HexeEventT = EventT;
+  MemoryAccessInfoT = MAST;
+  HexeKernelInfoT = KernelInfoT;
+}
+
+char WorkloadExtractor::ID = 1; //just because everyone sets it to 0
+INITIALIZE_PASS_BEGIN(WorkloadExtractor,
+    "hexe-extract", "Hexe Workload Extractor", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominanceFrontier)
+INITIALIZE_PASS_DEPENDENCY(WorkloadAnalysis)
+INITIALIZE_PASS_END(WorkloadExtractor,
+    "hexe-extract", "Hexe Workload Extractor", false, false)
+
+ModulePass *createWorkloadExtractorPass()
+{
+  return new WorkloadExtractor();
+}
+
+WorkloadExtractor::WorkloadExtractor() : ModulePass(ID),
+  WA(nullptr), HW(nullptr), WEU(nullptr)
+{
+  initializeWorkloadExtractorPass(*PassRegistry::getPassRegistry());
+}
+
+void WorkloadExtractor::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.addRequired<LoopInfoWrapperPass>();
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addRequired<DominanceFrontier>();
+  AU.addRequired<WorkloadAnalysis>();
+}
+
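/*
 * Illustrative sketch only -- not part of this patch. The call sequence that
 * the extractor wires around an offloaded region, written out in C so the
 * __hexe_* CHECK lines in the extractor test further down in this patch are
 * easier to follow. The struct mirrors hexe_mem_access_info_t above; the
 * meaning of a non-zero scheduler decision, the void* stand-in for
 * hexe_kernel_info__t, and the helper offload_or_run() are assumptions made
 * for the example.
 */
#include <stddef.h>

typedef struct hexe_event hexe_event_t;

typedef struct {
  void *ptr;
  unsigned size;
  unsigned arg_order;
  char access_mode;   /* read 0, write 1, readwrite 3 */
} hexe_mem_access_info_t;

/* Declarations only; the implementations live in the Hexe runtime library. */
int __hexe_runtime_sched(unsigned function_id, hexe_mem_access_info_t *info,
                         unsigned num_elems, void *kernel_info);
hexe_event_t *__hexe_dispatch(int function_id, void *args, int args_size);
void __hexe_enforce_coherency(int function_id, hexe_mem_access_info_t *info,
                              unsigned num_elems);
int __hexe_event_wait(hexe_event_t *event);

static int offload_or_run(unsigned id, void *args, int args_size,
                          hexe_mem_access_info_t *info, unsigned num_elems,
                          int (*run_on_host)(void *)) {
  if (!__hexe_runtime_sched(id, info, num_elems, NULL))
    return run_on_host(args);                    /* scheduler kept it local */
  __hexe_enforce_coherency(id, info, num_elems); /* make inputs visible     */
  hexe_event_t *ev = __hexe_dispatch(id, args, args_size);
  __hexe_event_wait(ev);                         /* block until completion  */
  __hexe_enforce_coherency(id, info, num_elems); /* make outputs visible    */
  return 0;
}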
+bool WorkloadExtractor::runOnModule(Module &M)
+{
+  LLVMContext &C = M.getContext();
+  WA = &getAnalysis<WorkloadAnalysis>();
+  HW = new HexeWorkload(C);
+
+  WEU = new WorkloadExtractUtil(WA, &M, this, true);
+  WEU->registerHexeWorkload(HW);
+
+  if (HexeFunctionCalls) { //compiler option defined in Hexe.cpp
+    //enable for offloading all the Function Call Workloads
+    auto &WDs = WA->getCallWorkloads();
+    for (auto I = WDs.begin(), IE = WDs.end(); I != IE; ++I)
+      WEU->extractWorkloadCode(I->second);
+  }
+
+  if (HexeLoops) { //compiler option defined in Hexe.cpp
+    //enable for offloading Function Loop Workloads
+    auto &WDs = WA->getLoopWorkloads();
+    while (WDs.size()) {
+      Loop *L;
+      WorkloadInfo *WI;
+      std::tie(L, WI) = *(WDs.begin());
+      WEU->extractWorkloadCode(WI);
+    }
+  }
+
+  //TODO: extend to consider specific Hexe policies
+
+  //replace the memory allocations used by workloads
+  //that have been transformed for offloading
+  WEU->replaceMemAllocations();
+
+  //make sure that the memory intrinsics used
+  //in the Hexe Workload Module have been
+  //declared properly
+  HW->validateSupportedFunctionCalls();
+
+  //set the host DataLayout and Triple
+  //on the Hexe Workload
+  HW->setDLandTriple(M.getDataLayout(),
+                     Triple(M.getTargetTriple()));
+
+  //write the Hexe Workload to a file; the filename is defined by the
+  //HexeWorkloadFName compiler option, defined in Hexe.cpp
+  HW->writeFile(HexeWorkloadFName);
+  return true;
+}
+
+void WorkloadExtractor::releaseMemory()
+{
+  delete HW;
+  delete WEU;
+}
Index: lib/Transforms/Hexe/WorkloadTransform.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Hexe/WorkloadTransform.cpp
@@ -0,0 +1,107 @@
+// ===- WorkloadTransform.cpp - Heterogeneous Execution Engine -*- C++ -*-=== //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// ===--------------------------------------------------------------------=== //
+/// \file
+/// Implementation of the Workload Transform Pass of the Heterogeneous
+/// Execution Engine. Please read the header file documentation for a
+/// high-level description.
+// ===--------------------------------------------------------------------=== //
+
+#include "llvm/Transforms/Hexe/WorkloadTransform.h"
+#include "llvm/Transforms/Hexe/HeterogeneousAdaptors/Adaptors.h"
+#include "llvm/Transforms/Hexe/Hexe.h"
+#include "llvm/Transforms/Hexe/InitializeHexePasses.h"
+#include "llvm/Transforms/Hexe/Utils.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Host.h"
+
+#define DEBUG_TYPE "hwt"
+
+using namespace llvm;
+
+char WorkloadTransform::ID = 1; //just because everyone sets it to 0
+INITIALIZE_PASS_BEGIN(WorkloadTransform, "hexe-transform",
+                      "Hexe Workload Transform", false, false)
+INITIALIZE_PASS_END(WorkloadTransform, "hexe-transform",
+                    "Hexe Workload Transform", false, false)
+
+ModulePass *createWorkloadTransformPass()
+{
+  return new WorkloadTransform();
+}
+
+WorkloadTransform::WorkloadTransform() : ModulePass(ID)
+{
+  initializeWorkloadTransformPass(*PassRegistry::getPassRegistry());
+}
+
+void WorkloadTransform::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  //empty for now....
+}
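// Illustrative sketch only -- not part of this patch. runOnModule() below
// obtains a HexeWorkloadAdaptor through a StringSwitch and then calls
// isSupported() and transform() on it. A minimal, do-nothing adaptor would
// have the following shape; the class name and the triple check are made up
// for the example, and a real adaptor such as the Hexagon one rewrites the
// workload module to the accelerator's conventions instead.
namespace {
class NullWorkloadAdaptor : public HexeWorkloadAdaptor {
public:
  void transform(Module *M, const Triple &HostTriple,
                 const Triple &AccelTriple,
                 const HexeFunctionInfoListT &FunctionList) override {
    // Deliberately a no-op: the module is left untouched.
  }

  bool isSupported(const Triple &HostTriple,
                   const Triple &AccelTriple) override {
    // Example policy: only allow pairs with matching pointer width.
    return HostTriple.isArch32Bit() == AccelTriple.isArch32Bit();
  }
};
} // end anonymous namespace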
+
+bool WorkloadTransform::runOnModule(Module &M)
+{
+  LLVMContext &C = M.getContext();
+
+  //get the Accelerator Target Triple
+  std::string AccelTripleStr = sys::getDefaultTargetTriple();
+  Triple AccelTriple(AccelTripleStr);
+
+  Triple HostTriple; //the Host Triple is read from the Hexe Metadata
+
+  //read the Hexe Metadata
+  readHexeMetadata(&M, functionNum, FL, HostTriple);
+
+  //create an instance of the requested Adaptor;
+  //HexeAdaptor is a compiler flag defined in Hexe.cpp
+  HexeWorkloadAdaptor *HWT =
+    StringSwitch<HexeWorkloadAdaptor *>(HexeAdaptor)
+      .Case("hexagon", createHexagonWorkloadAdaptor())
+      .Default(nullptr);
+
+  if (!HWT)
+    C.emitError("The Accelerator Adaptor is not supported.");
+
+  //check if the Adaptor supports IR transformations
+  //for the specific combination of Host and Accelerator Triples.
+  //This check can be skipped by setting the HexeAdaptorCheck flag
+  //to false.
+  if (HexeAdaptorCheck && !HWT->isSupported(HostTriple, AccelTriple)) {
+    std::string Msg = "Host: ";
+    Msg += HostTriple.str();
+    Msg += " Accelerator: ";
+    Msg += AccelTriple.str();
+    Msg += " not supported.";
+    C.emitError(Msg);
+  }
+
+  //perform the transformations
+  HWT->transform(&M, HostTriple, AccelTriple, FL);
+
+  //delete the adaptor instance
+  delete HWT;
+
+  //remove Hexe related Metadata from the Module
+  eraseHexeMetadata(&M);
+
+  return true;
+}
+
+void WorkloadTransform::releaseMemory()
+{
+  FL.clear();
+}
+
Index: test/Transforms/Hexe/analysistest1.ll
===================================================================
--- /dev/null
+++ test/Transforms/Hexe/analysistest1.ll
@@ -0,0 +1,599 @@
+; RUN: opt -S -hexe-analysis -analyze < %s | FileCheck %s
+
+; Code and Memory Reference Checks, Functions
+
+; CHECK: add:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: add_g:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs: G
+
+; CHECK: add_p:
+; CHECK-NEXT: InterfaceMemRefs: x y
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: add_pg:
+; CHECK-NEXT: InterfaceMemRefs: x
+; CHECK-NEXT: GlobalMemRefs: G
+
+; CHECK: comp:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: comp_g:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs: G
+
+; CHECK: comp_p:
+; CHECK-NEXT: InterfaceMemRefs: x y
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: comp_pg:
+; CHECK-NEXT: InterfaceMemRefs: x
+; CHECK-NEXT: GlobalMemRefs: G
+
+
+; Code and Memory Reference Checks, Loops
+
+; CHECK: for.body, Parent Function: comp:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: for.body, Parent Function: comp_g:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: for.body, Parent Function: comp_p:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+; CHECK: for.body, Parent Function: comp_pg:
+; CHECK-NEXT: InterfaceMemRefs:
+; CHECK-NEXT: GlobalMemRefs:
+
+
+; Memory Allocation Use Checks, Function Calls
+
+; CHECK: call, Callee: add Caller: call_point:
+; CHECK-NEXT: GlobalAllocations:
+; CHECK-NEXT: MallocAllocations:
+; CHECK-NEXT: AllocaAllocations:
+
+; CHECK: call, Callee: add Caller: call_point2:
+; CHECK-NEXT: GlobalAllocations:
+; CHECK-NEXT: MallocAllocations:
+; CHECK-NEXT: AllocaAllocations:
+
+; CHECK: call1, Callee: comp Caller: call_point:
+; CHECK-NEXT: GlobalAllocations:
+; CHECK-NEXT: MallocAllocations:
+; CHECK-NEXT: AllocaAllocations:
+
+; CHECK: call1, Callee: comp Caller: call_point2:
+; CHECK-NEXT: GlobalAllocations:
+; CHECK-NEXT: MallocAllocations:
+; CHECK-NEXT: AllocaAllocations:
+
+; CHECK:
call10, Callee: comp_p Caller: call_point: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: xb yb + +; CHECK: call12, Callee: comp_pg Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: xb + +; CHECK: call4, Callee: add_p Caller: call_point: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: call2 call3 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call4, Callee: add_p Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: call2 call3 +; CHECK-NEXT:AllocaAllocations: + +; CHECK: call5, Callee: add_g Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call5, Callee: add_g Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call6, Callee: add_pg Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: call2 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call6, Callee: add_pg Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: call2 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call7, Callee: comp_p Caller: call_point: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: call2 call3 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call7, Callee: comp_p Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: call2 call3 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call8, Callee: comp_g Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call8, Callee: comp_g Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call9, Callee: comp_pg Caller: call_point: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: call2 +; CHECK-NEXT: AllocaAllocations: + +; CHECK: call9, Callee: comp_pg Caller: call_point2: +; CHECK-NEXT: GlobalAllocations: G +; CHECK-NEXT: MallocAllocations: call2 +; CHECK-NEXT: AllocaAllocations: + + +; Memory Allocation Use Checks, Loops + +; CHECK: for.body, Parent Function: comp: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: for.body, Parent Function: comp_g: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: for.body, Parent Function: comp_p: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + +; CHECK: for.body, Parent Function: comp_pg: +; CHECK-NEXT: GlobalAllocations: +; CHECK-NEXT: MallocAllocations: +; CHECK-NEXT: AllocaAllocations: + + +;#include +;#include +; +;//scalar arguments +;int add(int x, int y) +;{ +; return x+y; +;} +; +;//sclar arguments, loop +;int comp(int x, int y, int lstep) +;{ +; int value=0; +; int i; +; +; for(i=0; i +; +;const unsigned N=100; +;char global[N]; +;int globali[N]; +; +;void funny_memcpy(char *dst, char *src, size_t N) +;{ +; int i; +; +; for(i=0; i* null, i32 0, %hexe_kernel_info__t* null) +; CHECK: %hexe_sched_decision_cast = trunc i32 %hexe_sched_decision to i1 +; CHECK: call void @__hexe_enforce_coherency(i32 0, <{ i8*, i32, i32, i8 }>* %5, i32 1) +; CHECK: %hexe_offload = call %hexe_event_t* @__hexe_dispatch(i32 0, i8* 
%16, i32 16) +; CHECK: %hexe_wait = call i32 @__hexe_event_wait(%hexe_event_t* %hexe_offload) +; CHECK: call void @__hexe_enforce_coherency(i32 0, <{ i8*, i32, i32, i8 }>* %5, i32 1) + + +; check control flow changes and runtime calls + +; CHECK: %hexe_sched_decision1 = call i32 @__hexe_runtime_sched(i32 1, <{ i8*, i32, i32, i8 }>* null, i32 0, %hexe_kernel_info__t* null) +; CHECK: %hexe_offload4 = call %hexe_event_t* @__hexe_dispatch(i32 1, i8* %60, i32 28) +; CHECK: call void @__hexe_enforce_coherency(i32 1, <{ i8*, i32, i32, i8 }>* %29, i32 4) +; CHECK: %hexe_offload10 = call %hexe_event_t* @__hexe_dispatch(i32 1, i8* %98, i32 28) + +; CHECK: define void @hexe.constructor() { +; CHECK: define void @hexe.destructor() { + + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "hexeextracttest--linux-gnueabi" + +@G = global float 2.000000e+00, align 4 +@G2 = global i32 10, align 4 + +; Function Attrs: noinline nounwind readonly +define i32 @add(i32 %x, i32 %y) #0 { +entry: + %add = add nsw i32 %y, %x + %0 = load i32, i32* @G2, align 4, !tbaa !3 + %add1 = add nsw i32 %add, %0 + ret i32 %add1 +} + +; Function Attrs: noinline nounwind readonly +define i32 @addpf(i32 %x, i32 %y, i32* nocapture readonly %i, i32* nocapture readonly %j) #0 { +entry: + %add = add nsw i32 %y, %x + %0 = load float, float* @G, align 4, !tbaa !7 + %conv = fptosi float %0 to i32 + %add1 = add nsw i32 %add, %conv + %1 = load i32, i32* %i, align 4, !tbaa !3 + %add2 = add nsw i32 %add1, %1 + %2 = load i32, i32* %j, align 4, !tbaa !3 + %add3 = add nsw i32 %add2, %2 + %3 = load i32, i32* @G2, align 4, !tbaa !3 + %add4 = add nsw i32 %add3, %3 + ret i32 %add4 +} + +; Function Attrs: noinline nounwind +define i32 @call_point() #1 { +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %call = tail call i32 @add(i32 10, i32 20) + store i32 40, i32* %a, align 4, !tbaa !3 + store i32 50, i32* %b, align 4, !tbaa !3 + %call1 = tail call noalias i8* @malloc(i32 4) #3 + %0 = bitcast i8* %call1 to i32* + %call2 = tail call noalias i8* @malloc(i32 4) #3 + %1 = bitcast i8* %call2 to i32* + store i32 100, i32* %0, align 4, !tbaa !3 + store i32 110, i32* %1, align 4, !tbaa !3 + %2 = load float, float* @G, align 4, !tbaa !7 + %conv = fptosi float %2 to i32 + %call3 = call i32 @addpf(i32 %conv, i32 100, i32* %a, i32* %b) + %call4 = tail call i32 @addpf(i32 10, i32 100, i32* %0, i32* %1) + tail call void @free(i8* %call1) #3 + tail call void @free(i8* %call2) #3 + %add = add nsw i32 %call3, %call + %add5 = add nsw i32 %add, %call4 + ret i32 %add5 +} + +; Function Attrs: nounwind +declare noalias i8* @malloc(i32) #2 + +; Function Attrs: nounwind +declare void @free(i8* nocapture) #2 + +attributes #0 = { noinline nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", 
i32 4} +!1 = !{i32 1, !"min_enum_size", i32 4} +!2 = !{!"Snapdragon LLVM ARM Compiler 3.5 (based on LLVM 3.7.0)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!8, !8, i64 0} +!8 = !{!"float", !5, i64 0} Index: test/Transforms/Hexe/transformtest1.ll =================================================================== --- /dev/null +++ test/Transforms/Hexe/transformtest1.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -hexe-transform -hexe-adaptor=hexagon -hexe-adaptor-check=false < %s | FileCheck %s + +;check function types + +;CHECK: type { i8*, i32 } +;CHECK: type { i32, i32, i32*, i32 } +;CHECK: type { i32, i32, i32*, i32*, float*, i32*, i32 } + +;check generated function interface + +;CHECK: define internal i32 @add_compact(%0*) { +;CHECK-NEXT: marshallingblock: + +;CHECK: define internal i32 @addpf_compact(%0*) { +;CHECK-NEXT: marshallingblock: + +; check skel_invoke +;CHECK: define i32 @__hexe_skel_invoke(i32, %0*) { +;CHECK: switch i32 %4, label %5 [ +;CHECK: i32 0, label %6 +;CHECK: i32 1, label %8 + +;CHECK: %7 = call i32 @add_compact(%0* %1) +;CHECK: %9 = call i32 @addpf_compact(%0* %1) + + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "hexe-unknown--unknown" + +; Function Attrs: noinline nounwind readonly +define available_externally i32 @add(i32, i32, i32*) #0 { +entry: + %add = add nsw i32 %1, %0 + %3 = load i32, i32* %2, align 4, !tbaa !4 + %add1 = add nsw i32 %add, %3 + ret i32 %add1 +} + +; Function Attrs: noinline nounwind readonly +define available_externally i32 @addpf(i32, i32, i32* nocapture readonly, i32* nocapture readonly, float*, i32*) #0 { +entry: + %add = add nsw i32 %1, %0 + %6 = load float, float* %4, align 4, !tbaa !8 + %conv = fptosi float %6 to i32 + %add1 = add nsw i32 %add, %conv + %7 = load i32, i32* %2, align 4, !tbaa !4 + %add2 = add nsw i32 %add1, %7 + %8 = load i32, i32* %3, align 4, !tbaa !4 + %add3 = add nsw i32 %add2, %8 + %9 = load i32, i32* %5, align 4, !tbaa !4 + %add4 = add nsw i32 %add3, %9 + ret i32 %add4 +} + +attributes #0 = { noinline nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!hexe.info = !{!0} +!hexe.host_triple = !{!1} +!hexe.function_list = !{!2, !3} + +!0 = !{i32 2} +!1 = !{!"hexeextracttest--linux-gnueabi"} +!2 = !{i32 (i32, i32, i32*)* @add, i32 0} +!3 = !{i32 (i32, i32, i32*, i32*, float*, i32*)* @addpf, i32 1} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9, !9, i64 0} +!9 = !{!"float", !6, i64 0} Index: tools/opt/CMakeLists.txt =================================================================== --- tools/opt/CMakeLists.txt +++ tools/opt/CMakeLists.txt @@ -15,6 +15,7 @@ Support Target TransformUtils + Hexe Vectorize Passes ) Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -317,6 +317,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeHexe(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. initializeCodeGenPreparePass(Registry);
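For reference, a minimal sketch of how a standalone tool could register and run the Hexe passes outside of opt, mirroring the opt.cpp hunk above. This is not part of the patch; the input filename, the pass selection, and the initialization calls shown are assumptions for the example, and the tool must link against the Hexe library added by this change.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Hexe/Hexe.h"
#include <memory>

using namespace llvm;

int main() {
  LLVMContext Context;
  SMDiagnostic Err;
  // "input.ll" is a placeholder for a host module to analyze and extract.
  std::unique_ptr<Module> M = parseIRFile("input.ll", Err, Context);
  if (!M)
    return 1;

  // Register the passes, as opt.cpp does after this patch.
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeCore(Registry);
  initializeAnalysis(Registry);
  initializeHexe(Registry);

  // Run the Hexe analysis and extraction over the module.
  legacy::PassManager PM;
  PM.add(createWorkloadAnalysisPass());
  PM.add(createWorkloadExtractorPass());
  PM.run(*M);
  return 0;
}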