Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -86,6 +86,10 @@
 ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
 FunctionPass *createAMDGPUAnnotateUniformValues();
 
+ModulePass *createAMDGPUPrintfRuntimeBinding();
+void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
+extern char &AMDGPUPrintfRuntimeBindingID;
+
 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
 extern char &SIFixControlFlowLiveIntervalsID;
 
Index: lib/Target/AMDGPU/AMDGPUPrinfRuntimeBinding.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUPrinfRuntimeBinding.cpp
@@ -0,0 +1,931 @@
+//=== AMDGPUPrintfRuntimeBinding.cpp -- For openCL -- bind Printfs to a kernel arg
+//    pointer that will be bound to a buffer later by the runtime ===//
+//===----------------------------------------------------------------------===//
+// March 2014.
+//      This pass traverses the functions in the module and converts
+//      each call to printf to a sequence of operations that
+//      store the following into the printf buffer :
+//      - format string (passed as a module's metadata unique ID)
+//      - bitwise copies of printf arguments
+//      The backend passes will need to store metadata in the kernel
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "printfToRuntime"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "AMDGPU.h"
+#define DWORD_ALIGN 4
+using namespace llvm;
+
+namespace {
+/// Module pass bundling the printf lowering helpers (GPU and CPU variants).
+class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding : public ModulePass {
+public:
+  static char ID; // aliased by llvm::AMDGPUPrintfRuntimeBindingID
+  explicit AMDGPUPrintfRuntimeBinding();
+  SmallVector<Value*, 32> printfs; // filled by collectPrintfsFromModule()
+  // Pass interface; marked override to agree with getAnalysisUsage() below.
+  const char* getPassName() const override;
+  bool runOnModule(Module &M) override;
+  bool doInitialization(Module &M) override;
+  bool doFinalization(Module &M) override;
+  void getConversionSpecifiers(SmallVectorImpl<char> &opConvSpecifiers,
+                               StringRef fmt, size_t num_ops) const;
+  // Helpers used by the lowering; all of them are defined out of line.
+  bool shouldPrintAsStr(char Specifier, Type* OpType) const;
+  bool confirmSpirModule(Module& M) const;
+  bool confirmOpenCLVersion200(Module& M) const;
+  bool lowerPrintfForGpu(Module &M);
+  bool lowerPrintfForCpu(Module &M);
+  void collectPrintfsFromModule(Module &M);
+  std::string transPrintfVectorFormat(StringRef stref);
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+  // Caches the data layout, the dominator tree (if available) and the TLI.
+  void initAnalysis(Module &M) {
+    TD = &M.getDataLayout();
+    auto DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  }
+
+  /// Prepare transformation.
+  /// \returns true if printf is found.
+  bool prepare(Module &M) {
+    collectPrintfsFromModule(M);
+    if (printfs.empty())
+      return false;
+    initAnalysis(M);
+    return true;
+  }
+  /// Forwards \p I to SimplifyInstruction; the result is returned unchecked.
+  Value *simplify(Instruction *I) {
+    auto AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+        *I->getParent()->getParent());
+    return SimplifyInstruction(I, *TD, TLI, DT, AC);
+  }
+  // Analysis handles: null after construction, assigned by initAnalysis().
+  const DataLayout *TD;
+  const DominatorTree *DT;
+  const TargetLibraryInfo *TLI;
+  static const int GlobalAddrspace = 1;
+};
+}
+
+char AMDGPUPrintfRuntimeBinding::ID = 0;
+// Register the pass as "amdgpu-printf-runtime-binding" with its analysis dependencies.
+INITIALIZE_PASS_BEGIN(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding",
+                      "AMDGPU Printf lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding",
+                    "AMDGPU Printf lowering", false, false)
+// Bind the extern reference declared in AMDGPU.h to this pass's ID.
+char &llvm::AMDGPUPrintfRuntimeBindingID = AMDGPUPrintfRuntimeBinding::ID;
+
+// Factory entry point declared in AMDGPU.h: allocates a new instance of the
+// printf runtime binding pass and returns it as a ModulePass pointer.
+ModulePass *llvm::createAMDGPUPrintfRuntimeBinding() {
+  return new AMDGPUPrintfRuntimeBinding();
+}
+
+AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding() // analysis handles start out null
+  : ModulePass(ID), TD(nullptr), DT(nullptr), TLI(nullptr) {
+  initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry()); // registers this pass
+}
+
+/// Reports whether the module's "opencl.ocl.version" named metadata declares
+/// OpenCL major version 2.
+///
+/// The named node must hold exactly one operand, and that operand must be a
+/// two-element node whose entries both decode as ConstantInt (major, minor).
+/// Any other shape yields false. The minor number only has to be present; its
+/// value is never inspected.
+bool AMDGPUPrintfRuntimeBinding::confirmOpenCLVersion200(Module& M) const {
+  const NamedMDNode *VersionMD = M.getNamedMetadata("opencl.ocl.version");
+  if (!VersionMD || VersionMD->getNumOperands() != 1)
+    return false;
+
+  const MDNode *Pair = VersionMD->getOperand(0);
+  if (Pair->getNumOperands() != 2)
+    return false;
+
+  auto *Major = mdconst::dyn_extract<ConstantInt>(Pair->getOperand(0));
+  auto *Minor = mdconst::dyn_extract<ConstantInt>(Pair->getOperand(1));
+  if (!Major || !Minor)
+    return false;
+
+  return Major->getZExtValue() == 2;
+}
+
+/// Extracts the conversion characters of the real (non-escaped) directives in
+/// \p Fmt and appends them, in order, to \p OpConvSpecifiers.
+///
+/// The format is scanned for any character of "cdieEfgGaosuxXp". For each hit,
+/// the text since the previous hit is examined: the hit is a directive only if
+/// that text contains a '%' and the run of consecutive '%' characters ending
+/// at the last one has odd length (an even run is made of "%%" escapes only).
+/// Callers use the result to tell %s from %p. \p NumOps is not consulted.
+void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers (
+  SmallVectorImpl<char> &OpConvSpecifiers,
+  StringRef Fmt, size_t NumOps) const {
+  static const char ConvSpecifiers[] = "cdieEfgGaosuxXp";
+  size_t SegmentBegin = 0;
+
+  for (size_t Hit = Fmt.find_first_of(ConvSpecifiers, 0);
+       Hit != StringRef::npos;
+       Hit = Fmt.find_first_of(ConvSpecifiers, Hit + 1)) {
+    const StringRef Segment = Fmt.substr(SegmentBegin, Hit - SegmentBegin);
+    SegmentBegin = Hit + 1;
+
+    const size_t LastPercent = Segment.find_last_of("%");
+    if (LastPercent == StringRef::npos)
+      continue;
+
+    // Length of the unbroken '%' run that ends at LastPercent.
+    size_t RunLength = 1;
+    while (RunLength <= LastPercent && Segment[LastPercent - RunLength] == '%')
+      ++RunLength;
+
+    if (RunLength % 2 != 0)
+      OpConvSpecifiers.push_back(Fmt[Hit]);
+  }
+}
+
+/// Decides whether a printf operand should be handled as a string.
+///
+/// \param Specifier conversion character taken from the format string.
+/// \param OpType    IR type of the matching call operand.
+/// \returns true only when \p Specifier is 's' and \p OpType is a pointer in
+///          address space 2 whose contained type is an 8-bit integer.
+bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier,
+                                               Type* OpType) const {
+  const bool IsStringConversion = (Specifier == 's');
+  if (!IsStringConversion)
+    return false;
+
+  // Only pointers living in address space 2 qualify.
+  const auto *PtrTy = dyn_cast<PointerType>(OpType);
+  if (!PtrTy || PtrTy->getAddressSpace() != 2)
+    return false;
+
+  // The pointee has to be an integer type that is exactly 8 bits wide.
+  const auto *PointeeInt =
+      dyn_cast<IntegerType>(PtrTy->getContainedType(0));
+  const bool IsByteWide = PointeeInt && PointeeInt->getBitWidth() == 8;
+  return IsByteWide;
+}
+
+/// True exactly when the module carries "opencl.spir.version" named metadata.
+bool AMDGPUPrintfRuntimeBinding::confirmSpirModule(Module& M) const {
+  const NamedMDNode *SpirVersionMD = M.getNamedMetadata("opencl.spir.version");
+  return SpirVersionMD != nullptr;
+}
+
+/// Gathers every direct call to a function named "printf" into `printfs`.
+/// Only functions that have a body are visited.
+void AMDGPUPrintfRuntimeBinding::collectPrintfsFromModule(Module& M) {
+  for (Function &Fn : M) {
+    if (Fn.isDeclaration())
+      continue;
+    for (BasicBlock &Block : Fn) {
+      for (Instruction &Inst : Block) {
+        auto *Call = dyn_cast<CallInst>(&Inst);
+        if (!Call)
+          continue;
+        // Calls for which getCalledFunction() gives null are skipped.
+        const Function *Callee = Call->getCalledFunction();
+        if (!Callee || Callee->getName() != "printf")
+          continue;
+        printfs.push_back(Call);
+      }
+    }
+  }
+}
+
+std::string AMDGPUPrintfRuntimeBinding::transPrintfVectorFormat(StringRef str) { // expands "vN" vector directives into N scalar ones
+  SmallVector<StringRef, 32> opndModifiers; // length modifier per directive ("" when none)
+  std::string fmt(str); // e.g. "%v4f" comes back as "%f,%f,%f,%f"
+  size_t curFmtSpecifierIdx = 0;
+  size_t nextFmtSpecifierIdx = 0;
+  size_t vecFmtSpecifierIdx = 0;
+  bool isVectorFormat = false; // set below but never read in this function
+  static const char convSpecifiers[] = "cdieEfgGaosuxXp";
+  curFmtSpecifierIdx = fmt.find_first_of('%',curFmtSpecifierIdx);
+  std::string transFmt = fmt.substr(0,curFmtSpecifierIdx); // text before the first '%' is copied unchanged
+  opndModifiers.push_back(""); // seed entry; presumably keeps back() valid below
+  while (curFmtSpecifierIdx != std::string::npos) { // one iteration per '%'-started chunk
+    nextFmtSpecifierIdx = fmt.find_first_of("%",curFmtSpecifierIdx + 1);
+    std::string curFmt; // the chunk from this '%' up to (not including) the next '%'
+    if (nextFmtSpecifierIdx != std::string::npos) {
+      curFmt = fmt.substr(curFmtSpecifierIdx,
+                          nextFmtSpecifierIdx - curFmtSpecifierIdx);
+    }
+    else {
+      curFmt = fmt.substr(curFmtSpecifierIdx);
+    }
+    size_t convSpecifierIdx;
+    // Record the length modifier ("h", "hh", "l", "hl" or none) that precedes
+    // the first conversion character of this chunk.
+    if ((convSpecifierIdx =
+         curFmt.find_first_of(convSpecifiers)) !=  std::string::npos) {
+      if (curFmt[convSpecifierIdx - 1] == 'h') {
+        if (convSpecifierIdx > 1 && curFmt[convSpecifierIdx - 2] == 'h') {
+          opndModifiers.push_back("hh");
+        } else {
+          opndModifiers.push_back("h");
+
+        }
+      } else if (curFmt[convSpecifierIdx - 1] == 'l') {
+        if (convSpecifierIdx > 1 && curFmt[convSpecifierIdx - 2] == 'h') {
+          opndModifiers.push_back("hl");
+        } else {
+          opndModifiers.push_back("l");
+        }
+      } else {
+        opndModifiers.push_back("");
+      }
+    }
+    std::string compFmt; // text that followed the conversion character of a vector directive
+    vecFmtSpecifierIdx = 0;
+    // Check whether a vector should be printed:
+    // one of "v2", "v3", "v4", "v8" or "v16" marks a
+    // vector conversion specifier and gives its element count.
+    while ((vecFmtSpecifierIdx = curFmt.find_first_of('v',vecFmtSpecifierIdx))
+           != std::string::npos) {
+      isVectorFormat = true;
+      char elmCount = 0; // first the digit character, then the numeric count
+      char elmFieldSize = 0; // number of digit characters after 'v'
+      if ((vecFmtSpecifierIdx + 1) < curFmt.length()) {
+        elmCount = curFmt[vecFmtSpecifierIdx + 1];
+        if ((elmCount == '1') && ((vecFmtSpecifierIdx + 2) < curFmt.length())
+            && (curFmt[vecFmtSpecifierIdx + 2] == '6')) {
+          elmCount = 16;
+          elmFieldSize = 2;
+        } else if ((('2' <= elmCount) && (elmCount <= '4'))
+                   || (elmCount == '8')) {
+          elmCount -= '0';
+          elmFieldSize = 1;
+        }
+        else {
+          // If there is no valid element count after 'v',
+          // continue to look for a valid vector specifier.
+          elmCount = 0;
+          ++vecFmtSpecifierIdx;
+          continue;
+        }
+        // Rebuild the format so that it contains the
+        // conversion specifier once for each vector element.
+        if (elmCount) {
+          std::string fmtSuffix;
+          convSpecifierIdx = curFmt.find_first_of(
+            convSpecifiers,vecFmtSpecifierIdx + 1);
+          if (curFmt.length() - 1 != convSpecifierIdx) { // split off what follows the conversion character
+            compFmt = curFmt.substr(convSpecifierIdx + 1);
+            curFmt = curFmt.erase(convSpecifierIdx + 1);
+          }
+          if (nextFmtSpecifierIdx == std::string::npos) { // this is the last chunk of the format
+            fmtSuffix = curFmt.substr(convSpecifierIdx + 1);
+            curFmt.erase(convSpecifierIdx + 1);
+          }
+          if (!curFmt.empty()) {
+            // If long value is represented by 4 bytes
+            // and that llvm long value is represented by 64-bit,
+            // the string format should be converted to have
+            // "ll" modifier.
+            if (opndModifiers.back() == "l" && sizeof(long) == 4) // NOTE(review): sizeof(long) is the host compiler's
+              curFmt.insert(convSpecifierIdx - 1, "l");
+            curFmt.erase(vecFmtSpecifierIdx, elmFieldSize+1); // drop the "vN" token itself
+            // The "hl" modifier is not needed for vector argument formats.
+            if (opndModifiers.back() == "hl") {
+              curFmt.erase(curFmt.find_first_of("hl"),2);
+            }
+            for (char i = 0; i < elmCount - 1; ++i) { // the last copy is appended after the loop over 'v'
+              transFmt = transFmt + curFmt + ",";
+            }
+          }
+          else {
+            curFmt = fmtSuffix;
+            break;
+          }
+          if (!fmtSuffix.empty()) {
+            curFmt += fmtSuffix;
+          }
+        }
+      }
+    }
+    transFmt += curFmt;
+    if (!compFmt.empty()) {
+      transFmt += compFmt;
+    }
+    curFmtSpecifierIdx = nextFmtSpecifierIdx;
+  }
+  return transFmt;
+}
+
+/// CPU lowering. For each collected printf with a constant string format the
+/// vector directives are expanded via transPrintfVectorFormat(); if the text
+/// changed, a new format global is installed and vector operands are passed
+/// element by element (float/half widened to double). Always returns true.
+bool AMDGPUPrintfRuntimeBinding::lowerPrintfForCpu(Module &M) {
+  for (Value *PrintfCall : printfs) {
+    CallInst* CI = cast<CallInst>(PrintfCall);
+    Value *op = CI->getArgOperand(0);
+    // simplify() yields null when nothing simplifies; keep the operand then.
+    if (auto I = dyn_cast<Instruction>(op))
+      if (Value *Simplified = simplify(I))
+        op = Simplified;
+    ConstantExpr *const_expr = dyn_cast<ConstantExpr>(op);
+    if (const_expr) {
+      GlobalVariable *GVar = dyn_cast<GlobalVariable>(
+            const_expr->getOperand(0));
+      if (GVar && GVar->hasInitializer()) {
+        ConstantDataArray *CA = dyn_cast<ConstantDataArray>(
+              GVar->getInitializer());
+        // Initializers that are not data arrays (CA == null) are skipped.
+        if (CA && CA->isString()) {
+          StringRef str = CA->getAsCString();
+          DEBUG(dbgs() << "Processing cpu printf format = "
+                << str.str() << '\n');
+          std::string trans = transPrintfVectorFormat(str);
+          if (trans != str) {
+            Constant *fmtStrArray =
+              ConstantDataArray::getString(M.getContext(), trans.c_str(), true);
+            GlobalVariable* newfmt = new GlobalVariable(M,
+                                      fmtStrArray->getType(),
+                                      true,
+                                      GlobalValue::ExternalLinkage,
+                                      fmtStrArray, "fmtPrintf",
+                                      nullptr, GlobalVariable::NotThreadLocal,
+                                      GVar->getType()->getAddressSpace());
+            DEBUG(dbgs() << "Format after expanding vectors = "
+                  << *newfmt << '\n');
+            Constant* ncexp = ConstantExpr::getBitCast(newfmt,
+                                const_expr->getType());
+            const_expr->replaceAllUsesWith(ncexp);
+            if (CI->getNumArgOperands() > 1 ) {
+              SmallVector<Value*, 32> callargs;
+              callargs.push_back(ncexp);
+              bool callFix = false;
+              Type *I32Ty = Type::getInt32Ty(M.getContext());
+              for (unsigned argcount = 1;
+                   argcount < CI->getNumArgOperands();
+                   argcount++) {
+                Value *arg = CI->getArgOperand(argcount);
+                Type *argtype = arg->getType();
+                if (argtype->getTypeID() == Type::VectorTyID) {
+                  callFix = true;
+                  uint32_t elemSize =
+                    cast<VectorType>(arg->getType())->getNumElements();
+                  DEBUG(dbgs() << "Need to extract printf vector = "
+                        << *arg << '\n');
+                  for (uint32_t idxv = 0; idxv < elemSize; ++idxv) {
+                    Value* extr = ExtractElementInst::Create(
+                                 arg, ConstantInt::get(I32Ty, idxv, false),
+                                 "printfvecext", CI);
+                    DEBUG(dbgs() << "printf vector extract = " <<
+                          *extr << '\n');
+                    DEBUG(dbgs() << "extract's type = "
+                          << *extr->getType() << '\n');
+                    if (argtype->getScalarType()->isFloatTy() ||
+                        argtype->getScalarType()->isHalfTy()) {
+                      Type *doublety = Type::getDoubleTy(M.getContext());
+                      extr = CastInst::CreateFPCast(extr, doublety,
+                                          "defArgPromPrintfVec", CI);
+                      DEBUG(dbgs() << "FPext ins = " << *extr << '\n');
+                    }
+                    callargs.push_back(extr);
+                  }
+                } else {
+                  DEBUG(dbgs() << "nonvector = " << *arg << '\n');
+                  callargs.push_back(arg);
+                }
+              }
+              if (callFix) {
+                DEBUG(dbgs() << "printf function signature = "
+                      << *CI->getCalledFunction() << '\n');
+                CallInst *newprintf = CallInst::Create(CI->getCalledFunction(),
+                                                    callargs, "printf_", CI);
+                DEBUG(dbgs() << "Before transformation of vector = "
+                      << *CI << '\n');
+                DEBUG(dbgs() << "Now = " << *newprintf << '\n');
+                // Forward users of the old call's result before erasing it.
+                CI->replaceAllUsesWith(newprintf);
+                CI->eraseFromParent();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(Module &M) {
+  LLVMContext &Ctx = M.getContext();
+  IRBuilder<> Builder(Ctx);
+  Type *I32Ty = Type::getInt32Ty(Ctx);
+  unsigned UniqID = 0;
+  // NB: This is important for this string size to be divizable by 4
+  const char non_literal_str[4] = "???";
+
+  for (SmallVectorImpl<Value*>::iterator
+         print_iterate = printfs.begin(),
+            print_iterate_e = printfs.end();
+       print_iterate != print_iterate_e;
+       ++print_iterate) {
+    CallInst* CI = dyn_cast<CallInst>( *print_iterate);
+
+    unsigned num_ops = CI->getNumArgOperands();
+
+    SmallString<16> opConvSpecifiers;
+    Value *op = CI->getArgOperand(0);
+    if (auto I = dyn_cast<Instruction>(op))
+      op = simplify(I);
+
+    ConstantExpr *const_expr = dyn_cast<ConstantExpr>(op);
+
+    if (const_expr) {
+      GlobalVariable *GVar = dyn_cast<GlobalVariable>(
+            const_expr->getOperand(0));
+
+      StringRef str("unknown");
+      if (GVar && GVar->hasInitializer()) {
+        ConstantDataArray *CA = dyn_cast<ConstantDataArray>(
+              GVar->getInitializer());
+        if (CA->isString()) {
+          str = CA->getAsCString();
+        }
+        //
+        // we need this call to ascertain
+        // that we are printing a string
+        // or a pointer. It takes out the
+        // specifiers and fills up the first
+        // arg
+        getConversionSpecifiers( opConvSpecifiers, str, num_ops - 1);
+      }
+      // Add metadata for the string
+      std::string astreamholder;
+      raw_string_ostream sizes(astreamholder);
+      int sum = DWORD_ALIGN;
+      sizes << CI->getNumArgOperands() -1;
+      sizes << ':';
+      for (unsigned argcount = 1;
+           argcount < CI->getNumArgOperands()
+             && argcount <= opConvSpecifiers.size();
+           argcount++) {
+        Value *arg = CI->getArgOperand(argcount);
+        Type *argtype = arg->getType();
+        unsigned argsize = TD->getTypeAllocSizeInBits(argtype);
+        argsize = argsize/8;
+        //
+        // ArgSize by design should be a multiple of DWORD_ALIGN,
+        // expand the arguments that do not follow this rule.
+        //
+        if (argsize % DWORD_ALIGN != 0) {
+          llvm::Type* resType = llvm::Type::getInt32Ty(Ctx);
+          VectorType* llvmVecType = llvm::dyn_cast<llvm::VectorType>(argtype);
+          int numEle = llvmVecType ? llvmVecType->getNumElements() : 1;
+          if (llvmVecType && numEle > 1)
+            resType = llvm::VectorType::get(resType, numEle);//static_cast<int>(numEle));
+          Builder.SetInsertPoint(CI);
+          Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+          if (opConvSpecifiers[argcount - 1] == 'x' ||
+            opConvSpecifiers[argcount - 1] == 'X' ||
+            opConvSpecifiers[argcount - 1] == 'u' ||
+            opConvSpecifiers[argcount - 1] == 'o')
+            arg = Builder.CreateZExt(arg, resType);
+          else
+            arg = Builder.CreateSExt(arg, resType);
+          argtype = arg->getType();
+          argsize = TD->getTypeAllocSizeInBits(argtype);
+          argsize = argsize / 8;
+          CI->setOperand(argcount, arg);
+        }
+        if (opConvSpecifiers[argcount - 1] == 'f') {
+          ConstantFP *fpCons = dyn_cast<ConstantFP>(arg);
+          if (fpCons)
+            argsize = 4;
+          else {
+            FPExtInst *fpext = dyn_cast<FPExtInst>(arg);
+            if (fpext && fpext->getType()->isDoubleTy() &&
+                fpext->getOperand(0)->getType()->isFloatTy())
+              argsize = 4;
+          }
+        }
+        if (shouldPrintAsStr(opConvSpecifiers[argcount - 1], argtype)) {
+          if (ConstantExpr *strC = dyn_cast<ConstantExpr>(arg)) {
+            GlobalVariable *strG
+              = dyn_cast<GlobalVariable>(strC->getOperand(0));
+            if (strG && strG->hasInitializer()) {
+              Constant *Init = strG->getInitializer();
+              ConstantDataArray *strCA = dyn_cast<ConstantDataArray>(Init);
+              if (Init->isZeroValue() || strCA->isString()) {
+                size_t size_str = Init->isZeroValue() ? 1 :
+                                    (strlen(strCA->getAsCString().data()) + 1);
+                size_t rem = size_str % DWORD_ALIGN;
+                size_t nsize_str = 0;
+                DEBUG(dbgs() << "Printf string original size = " << size_str << '\n');
+                if (rem) {
+                  nsize_str = size_str + (DWORD_ALIGN - rem);
+                } else {
+                  nsize_str = size_str;
+                }
+                argsize = nsize_str;
+              }
+            } else {
+              argsize = sizeof(non_literal_str);
+            }
+          } else {
+            argsize = sizeof(non_literal_str);
+          }
+        }
+        DEBUG(dbgs() << "Printf argsize (in buffer) = "
+              << argsize << " for type: " << *argtype << '\n');
+        sizes << argsize << ':';
+        sum += argsize;
+      }
+      DEBUG(dbgs() << "Printf format string in source = "
+                   << str.str() << '\n');
+      for (size_t i = 0; i < str.size(); ++i) {
+        // Rest of the C escape sequences (e.g. \') are handled correctly
+        // by the MDParser
+        switch (str[i]) {
+        case '\a':
+          sizes << "\\a";
+          break;
+        case '\b':
+          sizes << "\\b";
+          break;
+        case '\f':
+          sizes << "\\f";
+          break;
+        case '\n':
+          sizes << "\\n";
+          break;
+        case '\r':
+          sizes << "\\r";
+          break;
+        case '\v':
+          sizes << "\\v";
+          break;
+        case ':':
+          // ':' cannot be scanned by Flex, as it is defined as a delimiter
+          // Replace it with it's octal representation \72
+          sizes << "\\72";
+          break;
+        default:
+          sizes << str[i];
+          break;
+        }
+      }
+
+      // Insert the printf_alloc call
+      Builder.SetInsertPoint(CI);
+      Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+      AttributeSet attr = AttributeSet::get(Ctx, AttributeSet::FunctionIndex,
+                                            Attribute::NoUnwind);
+
+      Type *sizetTy = Type::getInt32Ty(Ctx);
+
+      Type *Tys_alloc[1] = { sizetTy };
+      Type *I8Ptr = PointerType::get( Type::getInt8Ty(Ctx), 1);
+      FunctionType *FTy_alloc
+        = FunctionType::get( I8Ptr, Tys_alloc, false);
+      Constant *printf_alloc_fn
+        = M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, attr);
+      Function *afn = dyn_cast<Function>(printf_alloc_fn);
+      afn->setCallingConv(llvm::CallingConv::SPIR_FUNC);
+      DEBUG(dbgs() << "inserting printf_alloc decl, an extern @ pre-link:");
+      DEBUG(dbgs() << *afn);
+
+      DEBUG(dbgs() << "Printf metadata = " << sizes.str() << '\n');
+      std::string fmtstr = itostr(++UniqID) + ":" + sizes.str().c_str();
+      MDString *fmtStrArray
+        = MDString::get( Ctx, fmtstr );
+
+
+      // Instead of creating global variables, the
+      // printf format strings are extracted
+      // and passed as metadata. This avoids
+      // polluting llvm's symbol tables in this module.
+      // Metadata is going to be extracted
+      // by the backend passes and inserted
+      // into the OpenCL binary as appropriate.
+      StringRef amd("llvm.printf.fmts");
+      NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd);
+      MDNode *myMD = MDNode::get(Ctx,fmtStrArray);
+      metaD->addOperand(myMD);
+      Value *sumC = ConstantInt::get( sizetTy, sum, false);
+      SmallVector<Value*,1> alloc_args;
+      alloc_args.push_back(sumC);
+      CallInst *pcall = CallInst::Create( afn, alloc_args,
+                                         "printf_alloc_fn", CI);
+      pcall->setCallingConv(llvm::CallingConv::SPIR_FUNC);
+
+      //
+      // Insert code to split basicblock with a
+      // piece of hammock code.
+      // basicblock splits after buffer overflow check
+      //
+      ConstantPointerNull *zeroIntPtr
+        = ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx),
+            1));
+      ICmpInst *cmp
+        = dyn_cast<ICmpInst>(
+            Builder.CreateICmpNE(pcall, zeroIntPtr, ""));
+      if (!CI->use_empty()) {
+        Value *result = Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty,
+                                           "printf_res");
+        CI->replaceAllUsesWith(result);
+      }
+      SplitBlock(CI->getParent(), cmp);
+      TerminatorInst *brnch
+        = SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false);
+
+      Builder.SetInsertPoint(brnch);
+
+      // store unique printf id in the buffer
+      //
+      SmallVector<Value*, 1> ZeroIdxList;
+      ConstantInt* zeroInt
+        = ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10));
+      ZeroIdxList.push_back(zeroInt);
+
+      GetElementPtrInst *buffer_idx
+        = dyn_cast<GetElementPtrInst>(
+            GetElementPtrInst::Create(nullptr,
+            pcall, ZeroIdxList, "PrintBuffID", brnch));
+
+      Type *idPointer
+        = PointerType::get(I32Ty, GlobalAddrspace);
+      Value *id_gep_cast
+        = new BitCastInst( buffer_idx, idPointer,
+                           "PrintBuffIdCast", brnch);
+
+      StoreInst* stbuff
+        = new StoreInst( ConstantInt::get(I32Ty, UniqID), id_gep_cast);
+      stbuff->insertBefore(brnch); // to remove unused variable warning
+
+      SmallVector<Value*,2> FourthIdxList;
+      ConstantInt* fourInt
+        = ConstantInt::get(Ctx, APInt(
+            32, StringRef("4"), 10));
+
+      FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id
+      // the following GEP is the buffer pointer
+      buffer_idx
+        = cast<GetElementPtrInst>(GetElementPtrInst::Create(nullptr,
+              pcall, FourthIdxList, "PrintBuffGep", brnch));
+
+      Type* Int32Ty = Type::getInt32Ty(Ctx);
+      Type* Int64Ty = Type::getInt64Ty(Ctx);
+      for (unsigned argcount = 1;
+           argcount < CI->getNumArgOperands()
+             && argcount <= opConvSpecifiers.size();
+           argcount++) {
+        Value *arg = CI->getArgOperand(argcount);
+        Type *argType = arg->getType();
+        SmallVector<Value*,32> whatToStore;
+        if (argType->isFPOrFPVectorTy()
+              && (argType->getTypeID() != Type::VectorTyID)) {
+          Type *iType = (argType->isFloatTy()) ?  Int32Ty : Int64Ty;
+          if (opConvSpecifiers[argcount - 1] == 'f') {
+            ConstantFP *fpCons = dyn_cast<ConstantFP>(arg);
+            if (fpCons) {
+              APFloat Val(fpCons->getValueAPF());
+              bool lost = false;
+              Val.convert(APFloat::IEEEsingle,
+                          APFloat::rmNearestTiesToEven,
+                          &lost);
+              arg = ConstantFP::get(Ctx, Val);
+              iType = Int32Ty;
+            } else {
+              FPExtInst *fpext = dyn_cast<FPExtInst>(arg);
+              if (fpext && fpext->getType()->isDoubleTy()
+                  && fpext->getOperand(0)->getType()->isFloatTy()) {
+                arg = fpext->getOperand(0);
+                iType = Int32Ty;
+              }
+            }
+          }
+          arg = new BitCastInst(arg, iType, "PrintArgFP", brnch);
+          whatToStore.push_back(arg);
+        } else if (argType->getTypeID() == Type::PointerTyID) {
+          if (shouldPrintAsStr(opConvSpecifiers[argcount - 1], argType)) {
+            const char *s = non_literal_str;
+            if (ConstantExpr *strC = dyn_cast<ConstantExpr>(arg)) {
+              GlobalVariable *strG
+                = dyn_cast<GlobalVariable>(strC->getOperand(0));
+              if (strG && strG->hasInitializer()) {
+                Constant *Init = strG->getInitializer();
+                ConstantDataArray *strCA = dyn_cast<ConstantDataArray>(Init);
+                if (Init->isZeroValue() || strCA->isString()) {
+                  s = Init->isZeroValue() ? "" : strCA->getAsCString().data();
+                }
+              }
+            }
+            size_t size_str = strlen(s) + 1;
+            size_t rem = size_str % DWORD_ALIGN;
+            size_t nsize_str = 0;
+            if (rem) {
+              nsize_str = size_str + (DWORD_ALIGN - rem);
+            } else {
+              nsize_str = size_str;
+            }
+            if (s[0]) {
+              char *mynewstr = new char[nsize_str]();
+              strcpy(mynewstr, s);
+              int numints = nsize_str/4;
+              int charc = 0;
+              while(numints) {
+                int anum = *(int*)(mynewstr+charc);
+                charc += 4;
+                numints--;
+                Value *anumV = ConstantInt::get( Int32Ty, anum, false);
+                whatToStore.push_back(anumV);
+              }
+              delete mynewstr;
+            } else {
+              // Empty string, give a hint to RT it is no NULL
+              Value *anumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false);
+              whatToStore.push_back(anumV);
+            }
+          } else {
+            uint64_t Size = TD->getTypeAllocSizeInBits(argType);
+            assert((Size == 32 || Size == 64) && "unsupported size");
+            Type* DstType = (Size == 32) ? Int32Ty : Int64Ty;
+            arg = new PtrToIntInst(arg, DstType,
+                                    "PrintArgPtr", brnch);
+            whatToStore.push_back(arg);
+          }
+        } else if (argType->getTypeID() == Type::VectorTyID) {
+          Type *iType = NULL;
+          uint32_t eleCount = cast<VectorType>(argType)->getNumElements();
+          uint32_t eleSize = argType->getScalarSizeInBits();
+          uint32_t totalSize = eleCount * eleSize;
+          if (eleCount == 3) {
+            IntegerType *int32ty
+              = Type::getInt32Ty(argType->getContext());
+            Constant* indices[4]
+              = { ConstantInt::get(int32ty, 0),
+                  ConstantInt::get(int32ty, 1),
+                  ConstantInt::get(int32ty, 2),
+                  ConstantInt::get(int32ty, 2)
+                };
+            Constant* mask = ConstantVector::get(indices);
+            ShuffleVectorInst* shuffle
+              = new ShuffleVectorInst(arg, arg, mask);
+            shuffle->insertBefore(brnch);
+            arg = shuffle;
+            argType = arg->getType();
+            totalSize += eleSize;
+          }
+          switch (eleSize) {
+            default:
+              eleCount = totalSize / 64;
+              iType = dyn_cast<Type>(
+                        Type::getInt64Ty(
+                          argType->getContext()));
+              break;
+            case 8:
+              if (eleCount >= 8) {
+                eleCount = totalSize / 64;
+                iType = dyn_cast<Type>(
+                          Type::getInt64Ty(
+                            argType->getContext()));
+              } else if (eleCount >= 3) {
+                eleCount = 1;
+                iType = dyn_cast<Type>(
+                          Type::getInt32Ty(
+                            argType->getContext()));
+              } else {
+                eleCount = 1;
+                iType = dyn_cast<Type>(
+                          Type::getInt16Ty(
+                           argType->getContext()));
+              }
+              break;
+            case 16:
+              if (eleCount >= 3) {
+                eleCount = totalSize / 64;
+                iType = dyn_cast<Type>(
+                          Type::getInt64Ty(
+                            argType->getContext()));
+              } else {
+                eleCount = 1;
+                iType = dyn_cast<Type>(
+                          Type::getInt32Ty(
+                            argType->getContext()));
+              }
+              break;
+          }
+          if (eleCount > 1) {
+            iType = dyn_cast<Type>(
+                      VectorType::get(
+                        iType, eleCount));
+          }
+          arg = new BitCastInst(arg, iType, "PrintArgVect", brnch);
+          whatToStore.push_back(arg);
+        } else {
+          whatToStore.push_back(arg);
+        }
+
+        for ( SmallVectorImpl<Value*>::iterator
+               w_iterate = whatToStore.begin(),
+               w_iterate_e = whatToStore.end();
+               w_iterate != w_iterate_e; ) {
+          Value* thebtcast = *w_iterate;
+          unsigned argsize
+            = TD->getTypeAllocSizeInBits(thebtcast->getType())/8;
+          SmallVector<Value*,1> buffOffset;
+          buffOffset.push_back(
+            ConstantInt::get( I32Ty, argsize));
+
+          Type *argPointer
+            = PointerType::get( thebtcast->getType(), 1);
+          Value *casted_gep
+            = new BitCastInst( buffer_idx, argPointer,
+                                 "PrintBuffPtrCast", brnch);
+          StoreInst* stbuff
+            = new StoreInst(
+                    thebtcast, casted_gep, brnch);
+          DEBUG(dbgs() << "inserting store to printf buffer:\n"
+                       << *stbuff << '\n');
+          ++w_iterate;
+          if (w_iterate == w_iterate_e
+              && argcount+1 == CI->getNumArgOperands())
+            break;
+          buffer_idx
+              = dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create(
+                    nullptr, buffer_idx, buffOffset, "PrintBuffNextPtr", brnch));
+          DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
+                       << *buffer_idx << '\n');
+        }
+      }
+    }
+  }
+  //erase the printf calls
+  for (SmallVectorImpl<Value*>::iterator
+         print_iterate = printfs.begin(),
+            print_iterate_e = printfs.end();
+       print_iterate != print_iterate_e;
+       ++print_iterate) {
+    CallInst* CI
+      = dyn_cast<CallInst>( *print_iterate);
+    CI->eraseFromParent();
+  }
+  return true;
+}
+
+static bool isX86Triple(const llvm::Triple &Triple) { // x86 or x86_64 arch?
+  const llvm::Triple::ArchType Arch = Triple.getArch();
+  return Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64;
+}
+
+// Runs prepare() and, only if it returns true, lowers printf through the CPU
+// path for x86/x86_64 target triples or through the GPU path otherwise.
+bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) {
+  const bool TargetsX86 = isX86Triple(Triple(M.getTargetTriple()));
+  if (!prepare(M))
+    return false;
+
+  if (TargetsX86)
+    return lowerPrintfForCpu(M);
+  return lowerPrintfForGpu(M);
+}
+
+const char* AMDGPUPrintfRuntimeBinding::getPassName() const {
+  return "AMD Printf lowering part 1"; // fixed name string for this pass
+}
+
+bool AMDGPUPrintfRuntimeBinding::doInitialization(Module &M) {
+  return false; // no initialization work is done; M is not touched
+}
+
+bool AMDGPUPrintfRuntimeBinding::doFinalization(Module &M) {
+  return false; // no finalization work is done; M is not touched
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -50,6 +50,7 @@
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+  void addPreLinkPasses(PassManagerBase &) override; // adds pre-link passes
 };
 
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Vectorize.h"
+#include "llvm/IR/LegacyPassManager.h"
 
 using namespace llvm;
 
@@ -82,6 +83,7 @@
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
+  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -166,6 +168,11 @@
     FSAttr.getValueAsString();
 }
 
+// Adds the AMDGPU printf runtime-binding pass to the given pass manager.
+void AMDGPUTargetMachine::addPreLinkPasses(PassManagerBase &PM) {
+  PM.add(createAMDGPUPrintfRuntimeBinding());
+}
+
 //===----------------------------------------------------------------------===//
 // R600 Target Machine (R600 -> Cayman)
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@
   AMDGPUInstrInfo.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
+  AMDGPUPrinfRuntimeBinding.cpp
   GCNHazardRecognizer.cpp
   R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
Index: test/CodeGen/AMDGPU/printf.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/printf.ll
@@ -0,0 +1,59 @@
+; RUN: opt -mtriple=amdgcn--amdhsa -amdgpu-printf-runtime-binding -mcpu=fiji -S < %s | FileCheck %s
+; CHECK-LABEL: entry
+; CHECK: call spir_func i8 addrspace(1)* @__printf_alloc
+; CHECK-LABEL: entry.split
+; CHECK: icmp ne i8 addrspace(1)* %printf_alloc_fn, null
+; CHECK: %PrintBuffID = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 0
+; CHECK: %PrintBuffIdCast = bitcast i8 addrspace(1)* %PrintBuffID to i32 addrspace(1)*
+; CHECK: store i32 1, i32 addrspace(1)* %PrintBuffIdCast
+; CHECK: %PrintBuffGep = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 4
+; CHECK: %PrintArgPtr = ptrtoint i8* %arraydecay to i64
+; CHECK: %PrintBuffPtrCast = bitcast i8 addrspace(1)* %PrintBuffGep to i64 addrspace(1)*
+; CHECK: store i64 %PrintArgPtr, i64 addrspace(1)* %PrintBuffPtrCast
+; CHECK: %PrintBuffNextPtr = getelementptr i8, i8 addrspace(1)* %PrintBuffGep, i32 8
+; CHECK: %PrintBuffPtrCast1 = bitcast i8 addrspace(1)* %PrintBuffNextPtr to i32 addrspace(1)*
+; CHECK: store i32 %3, i32 addrspace(1)* %PrintBuffPtrCast1
+
+@test_kernel.str = private unnamed_addr constant [9 x i8] c"globalid\00", align 1
+@.str = private unnamed_addr addrspace(2) constant [6 x i8] c"%s:%d\00", align 1
+
+define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; kernel holding the single printf call
+entry:
+  %in.addr = alloca i32 addrspace(1)*, align 4
+  %out.addr = alloca i32 addrspace(1)*, align 4
+  %n = alloca i32, align 4
+  %str = alloca [9 x i8], align 1
+  store i32 addrspace(1)* %in, i32 addrspace(1)** %in.addr, align 4
+  store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 4
+  %0 = bitcast i32* %n to i8*
+  %call = call i64 @_Z13get_global_idj(i32 0) #5
+  %conv = trunc i64 %call to i32
+  store i32 %conv, i32* %n, align 4
+  %1 = bitcast [9 x i8]* %str to i8*
+  %2 = bitcast [9 x i8]* %str to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @test_kernel.str, i32 0, i32 0), i64 9, i32 1, i1 false) ; copy the 9 bytes of @test_kernel.str into %str
+  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8]* %str, i32 0, i32 0
+  %3 = load i32, i32* %n, align 4
+  %call1 = call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %3) ; printf with format @.str, a string pointer and an i32
+  %4 = load i32, i32* %n, align 4
+  %idxprom = sext i32 %4 to i64
+  %5 = load i32 addrspace(1)*, i32 addrspace(1)** %in.addr, align 4
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom
+  %6 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %7 = load i32, i32* %n, align 4
+  %idxprom2 = sext i32 %7 to i64
+  %8 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom2
+  store i32 %6, i32 addrspace(1)* %arrayidx3, align 4 ; out[n] = in[n]
+  %9 = bitcast [9 x i8]* %str to i8*
+  %10 = bitcast i32* %n to i8*
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @_Z13get_global_idj(i32) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+declare i32 @printf(i8 addrspace(2)*, ...) #3