Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -86,6 +86,10 @@ ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); +ModulePass *createAMDGPUPrintfRuntimeBinding(); +void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&); +extern char &AMDGPUPrintfRuntimeBindingID; + void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); extern char &SIFixControlFlowLiveIntervalsID; Index: lib/Target/AMDGPU/AMDGPUPrinfRuntimeBinding.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/AMDGPUPrinfRuntimeBinding.cpp @@ -0,0 +1,931 @@ +//=== AMDGPUPrintfRuntimeBinding.cpp -- For openCL -- bind Printfs to a kernel arg +// pointer that will be bound to a buffer later by the runtime ===// +//===----------------------------------------------------------------------===// +// March 2014. 
+// This pass traverses the functions in the module and converts +// each call to printf to a sequence of operations that +// store the following into the printf buffer : +// - format string (passed as a module's metadata unique ID) +// - bitwise copies of printf arguments +// The backend passes will need to store metadata in the kernel +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "printfToRuntime" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Type.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "AMDGPU.h" +#define DWORD_ALIGN 4 +using namespace llvm; + +namespace { +class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding : public ModulePass { +public: + static char ID; + explicit AMDGPUPrintfRuntimeBinding(); + SmallVector printfs; + const char* getPassName() const; + bool runOnModule(Module &M); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + void getConversionSpecifiers( + SmallVectorImpl &opConvSpecifiers, + StringRef fmt, + size_t num_ops) const; + + bool shouldPrintAsStr(char Specifier, Type* OpType) const; + bool confirmSpirModule(Module& M) const; + bool confirmOpenCLVersion200(Module& M) const; + bool lowerPrintfForGpu(Module &M); + bool lowerPrintfForCpu(Module &M); + void collectPrintfsFromModule(Module &M); + std::string transPrintfVectorFormat(StringRef stref); +private: + 
void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + void initAnalysis(Module &M) { + TD = &M.getDataLayout(); + auto DTWP = getAnalysisIfAvailable(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + TLI = &getAnalysis().getTLI(); + } + + /// Prepare transformation. + /// \returns true if printf is found. + bool prepare(Module &M) { + collectPrintfsFromModule(M); + if (printfs.empty()) + return false; + initAnalysis(M); + return true; + } + + Value *simplify(Instruction *I) { + auto AC = &getAnalysis().getAssumptionCache( + *I->getParent()->getParent()); + return SimplifyInstruction(I, *TD, TLI, DT, AC); + } + + const DataLayout *TD; + const DominatorTree *DT; + const TargetLibraryInfo *TLI; + static const int GlobalAddrspace = 1; +}; +} + +char AMDGPUPrintfRuntimeBinding::ID = 0; + +INITIALIZE_PASS_BEGIN(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding", + "AMDGPU Printf lowering", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding", + "AMDGPU Printf lowering", false, false) + +char &llvm::AMDGPUPrintfRuntimeBindingID = AMDGPUPrintfRuntimeBinding::ID; + +namespace llvm { +ModulePass *createAMDGPUPrintfRuntimeBinding() { + return new AMDGPUPrintfRuntimeBinding(); +} +} + +AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding() + : ModulePass(ID), TD(nullptr), DT(nullptr), TLI(nullptr) { + initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry()); +} + +bool AMDGPUPrintfRuntimeBinding::confirmOpenCLVersion200(Module& M) const { + NamedMDNode *OCLVersion = M.getNamedMetadata("opencl.ocl.version"); + if (!OCLVersion) { + return false; + } + if (OCLVersion->getNumOperands() != 1) { + return false; + } + MDNode *ver = OCLVersion->getOperand(0); + if 
(ver->getNumOperands() != 2) { + return false; + } + ConstantInt *major = mdconst::dyn_extract(ver->getOperand(0)); + ConstantInt *minor = mdconst::dyn_extract(ver->getOperand(1)); + if (0 == major || 0 == minor) { + return false; + } + if (major->getZExtValue() == 2) { + return true; + } else { + return false; + } +} + +void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers ( + SmallVectorImpl &OpConvSpecifiers, + StringRef Fmt, size_t NumOps) const { + // not all format characters are collected. + // At this time the format characters of interest + // are %p and %s, which use to know if we + // are either storing a literal string or a + // pointer to the printf buffer. + static const char ConvSpecifiers[] = "cdieEfgGaosuxXp"; + size_t CurFmtSpecifierIdx = 0; + size_t PrevFmtSpecifierIdx = 0; + + while ((CurFmtSpecifierIdx + = Fmt.find_first_of(ConvSpecifiers, CurFmtSpecifierIdx)) + != StringRef::npos) { + bool ArgDump = false; + StringRef CurFmt = Fmt.substr(PrevFmtSpecifierIdx, + CurFmtSpecifierIdx - PrevFmtSpecifierIdx); + size_t pTag = CurFmt.find_last_of("%"); + if (pTag != StringRef::npos) { + ArgDump = true; + while (pTag && CurFmt[--pTag] == '%') { + ArgDump = !ArgDump; + } + } + + if (ArgDump) { + OpConvSpecifiers.push_back(Fmt[CurFmtSpecifierIdx]); + } + + PrevFmtSpecifierIdx = ++CurFmtSpecifierIdx; + } +} + +bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier, + Type* OpType) const { + if (Specifier != 's') { + return false; + } + const PointerType *PT = dyn_cast(OpType); + if (!PT) { + return false; + } + if (PT->getAddressSpace() != 2) { + return false; + } + Type* ElemType = PT->getContainedType(0); + if (ElemType->getTypeID() != Type::IntegerTyID) { + return false; + } + IntegerType* ElemIType = cast(ElemType); + if (ElemIType->getBitWidth() == 8) { + return true; + } else { + return false; + } +} + +bool AMDGPUPrintfRuntimeBinding::confirmSpirModule(Module& M) const { + NamedMDNode *SPIRVersion = 
M.getNamedMetadata("opencl.spir.version"); + if (!SPIRVersion) return false; + else return true; +} + +void AMDGPUPrintfRuntimeBinding::collectPrintfsFromModule(Module& M) { + for (Module::iterator MF = M.begin(), E = M.end(); MF != E; ++MF) { + if (MF->isDeclaration()) continue; + BasicBlock::iterator curInstr; + for (Function::iterator BB = MF->begin(), + MFE = MF->end(); BB != MFE; ++BB) { + for (BasicBlock::iterator instr + = BB->begin(), instr_end = BB->end(); + instr != instr_end; ) { + CallInst *CI = dyn_cast(instr); + curInstr = instr; + instr++; + if (CI && CI->getCalledFunction() + && CI->getCalledFunction()->getName() == "printf") { + printfs.push_back(CI); + } + } + } + } +} + +std::string AMDGPUPrintfRuntimeBinding::transPrintfVectorFormat(StringRef str) { + SmallVector opndModifiers; + std::string fmt(str); + size_t curFmtSpecifierIdx = 0; + size_t nextFmtSpecifierIdx = 0; + size_t vecFmtSpecifierIdx = 0; + bool isVectorFormat = false; + static const char convSpecifiers[] = "cdieEfgGaosuxXp"; + curFmtSpecifierIdx = fmt.find_first_of('%',curFmtSpecifierIdx); + std::string transFmt = fmt.substr(0,curFmtSpecifierIdx); + opndModifiers.push_back(""); + while (curFmtSpecifierIdx != std::string::npos) { + nextFmtSpecifierIdx = fmt.find_first_of("%",curFmtSpecifierIdx + 1); + std::string curFmt; + if (nextFmtSpecifierIdx != std::string::npos) { + curFmt = fmt.substr(curFmtSpecifierIdx, + nextFmtSpecifierIdx - curFmtSpecifierIdx); + } + else { + curFmt = fmt.substr(curFmtSpecifierIdx); + } + size_t convSpecifierIdx; + //get modifier and store it in the opndModifiers + if ((convSpecifierIdx = + curFmt.find_first_of(convSpecifiers)) != std::string::npos) { + if (curFmt[convSpecifierIdx - 1] == 'h') { + if (convSpecifierIdx > 1 && curFmt[convSpecifierIdx - 2] == 'h') { + opndModifiers.push_back("hh"); + } else { + opndModifiers.push_back("h"); + + } + } else if (curFmt[convSpecifierIdx - 1] == 'l') { + if (convSpecifierIdx > 1 && curFmt[convSpecifierIdx - 2] == 
'h') { + opndModifiers.push_back("hl"); + } else { + opndModifiers.push_back("l"); + } + } else { + opndModifiers.push_back(""); + } + } + std::string compFmt; + vecFmtSpecifierIdx = 0; + // Check if the vector should be printed: + // one of "v16",v2,"v3",v4","v8" indicate + // to vector convension specifier and its elemnts count. + while ((vecFmtSpecifierIdx = curFmt.find_first_of('v',vecFmtSpecifierIdx)) + != std::string::npos) { + isVectorFormat = true; + char elmCount = 0; + char elmFieldSize = 0; + if ((vecFmtSpecifierIdx + 1) < curFmt.length()) { + elmCount = curFmt[vecFmtSpecifierIdx + 1]; + if ((elmCount == '1') && ((vecFmtSpecifierIdx + 2) < curFmt.length()) + && (curFmt[vecFmtSpecifierIdx + 2] == '6')) { + elmCount = 16; + elmFieldSize = 2; + } else if ((('2' <= elmCount) && (elmCount <= '4')) + || (elmCount == '8')) { + elmCount -= '0'; + elmFieldSize = 1; + } + else { + // If there is no element count after 'v', + // continue to look for valid vector specifier. + elmCount = 0; + ++vecFmtSpecifierIdx; + continue; + } + // Rebuild the format to contain the + // convension specifier to each of vector elements. + if (elmCount) { + std::string fmtSuffix; + convSpecifierIdx = curFmt.find_first_of( + convSpecifiers,vecFmtSpecifierIdx + 1); + if (curFmt.length() - 1 != convSpecifierIdx) { + compFmt = curFmt.substr(convSpecifierIdx + 1); + curFmt = curFmt.erase(convSpecifierIdx + 1); + } + if (nextFmtSpecifierIdx == std::string::npos) { + fmtSuffix = curFmt.substr(convSpecifierIdx + 1); + curFmt.erase(convSpecifierIdx + 1); + } + if (!curFmt.empty()) { + // If long value is represented by 4 bytes + // and that llvm long value is represented by 64-bit, + // the string format should be converted to have + // "ll" modifier. + if (opndModifiers.back() == "l" && sizeof(long) == 4) + curFmt.insert(convSpecifierIdx - 1, "l"); + curFmt.erase(vecFmtSpecifierIdx, elmFieldSize+1); + // Donot need "hl" modifier for vector arguments formats. 
+ if (opndModifiers.back() == "hl") { + curFmt.erase(curFmt.find_first_of("hl"),2); + } + for (char i = 0; i < elmCount - 1; ++i) { + transFmt = transFmt + curFmt + ","; + } + } + else { + curFmt = fmtSuffix; + break; + } + if (!fmtSuffix.empty()) { + curFmt += fmtSuffix; + } + } + } + } + transFmt += curFmt; + if (!compFmt.empty()) { + transFmt += compFmt; + } + curFmtSpecifierIdx = nextFmtSpecifierIdx; + } + return transFmt; +} + +bool AMDGPUPrintfRuntimeBinding::lowerPrintfForCpu(Module &M) { + for (SmallVectorImpl::iterator + print_iterate = printfs.begin(), + print_iterate_e = printfs.end(); + print_iterate != print_iterate_e; + ++print_iterate) { + CallInst* CI = dyn_cast( *print_iterate); + + SmallString<16> opConvSpecifiers; + Value *op = CI->getArgOperand(0); + if (auto I = dyn_cast(op)) + op = simplify(I); + ConstantExpr *const_expr = dyn_cast(op); + + if (const_expr) { + GlobalVariable *GVar = dyn_cast( + const_expr->getOperand(0)); + + if (GVar && GVar->hasInitializer()) { + ConstantDataArray *CA = dyn_cast( + GVar->getInitializer()); + if (CA->isString()) { + StringRef str("unknown"); + str = CA->getAsCString(); + DEBUG(dbgs() << "Processing cpu printf format = " + << str.str() << '\n'); + std::string trans = transPrintfVectorFormat(str); + if (trans != str) { + Constant *fmtStrArray = + ConstantDataArray::getString(M.getContext(), trans.c_str(), true); + GlobalVariable* newfmt = new GlobalVariable(M, + fmtStrArray->getType(), + true, + GlobalValue::ExternalLinkage, + fmtStrArray, "fmtPrintf", + NULL, GlobalVariable::NotThreadLocal, + GVar->getType()->getAddressSpace()); + DEBUG(dbgs() << "Format after expanding vectors = " + << *newfmt << '\n'); + Constant* ncexp = ConstantExpr::getBitCast(newfmt, + const_expr->getType()); + const_expr->replaceAllUsesWith(ncexp); + if (CI->getNumArgOperands() > 1 ) { + SmallVector callargs; + callargs.push_back(ncexp); + bool callFix = false; + Type *I32Ty = Type::getInt32Ty(M.getContext()); + for (unsigned argcount = 
1; + argcount < CI->getNumArgOperands(); + argcount++) { + Value *arg = CI->getArgOperand(argcount); + Type *argtype = arg->getType(); + if (argtype->getTypeID() == Type::VectorTyID) { + callFix = true; + uint32_t elemSize = + cast(arg->getType())->getNumElements(); + DEBUG(dbgs() << "Need to extract printf vector = " + << *arg << '\n'); + for (uint32_t idxv = 0; idxv < elemSize; ++idxv) { + Value* extr = ExtractElementInst::Create( + arg, ConstantInt::get(I32Ty, idxv, false), + "printfvecext", CI); + DEBUG(dbgs() << "printf vector extract = " << + *extr << '\n'); + DEBUG(dbgs() << "extract's type = " + << *extr->getType() << '\n'); + if (argtype->getScalarType()->isFloatTy() || + argtype->getScalarType()->isHalfTy()) { + Type *doublety = Type::getDoubleTy(M.getContext()); + extr = CastInst::CreateFPCast(extr, doublety, + "defArgPromPrintfVec", CI); + DEBUG(dbgs() << "FPext ins = " << *extr << '\n'); + } + callargs.push_back(extr); + } + } else { + DEBUG(dbgs() << "nonvector = " << *arg << '\n'); + callargs.push_back(arg); + } + } + if (callFix) { + DEBUG(dbgs() << "printf function signature = " + << *CI->getCalledFunction() << '\n'); + CallInst *newprintf = CallInst::Create(CI->getCalledFunction(), + callargs, "printf_", CI); + DEBUG(dbgs() << "Before transformation of vector = " + << *CI << '\n'); + DEBUG(dbgs() << "Now = " << *newprintf << '\n'); + CI->eraseFromParent(); + } + } + } + } + } + } + } + return true; +} + +bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(Module &M) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + Type *I32Ty = Type::getInt32Ty(Ctx); + unsigned UniqID = 0; + // NB: This is important for this string size to be divizable by 4 + const char non_literal_str[4] = "???"; + + for (SmallVectorImpl::iterator + print_iterate = printfs.begin(), + print_iterate_e = printfs.end(); + print_iterate != print_iterate_e; + ++print_iterate) { + CallInst* CI = dyn_cast( *print_iterate); + + unsigned num_ops = 
CI->getNumArgOperands(); + + SmallString<16> opConvSpecifiers; + Value *op = CI->getArgOperand(0); + if (auto I = dyn_cast(op)) + op = simplify(I); + + ConstantExpr *const_expr = dyn_cast(op); + + if (const_expr) { + GlobalVariable *GVar = dyn_cast( + const_expr->getOperand(0)); + + StringRef str("unknown"); + if (GVar && GVar->hasInitializer()) { + ConstantDataArray *CA = dyn_cast( + GVar->getInitializer()); + if (CA->isString()) { + str = CA->getAsCString(); + } + // + // we need this call to ascertain + // that we are printing a string + // or a pointer. It takes out the + // specifiers and fills up the first + // arg + getConversionSpecifiers( opConvSpecifiers, str, num_ops - 1); + } + // Add metadata for the string + std::string astreamholder; + raw_string_ostream sizes(astreamholder); + int sum = DWORD_ALIGN; + sizes << CI->getNumArgOperands() -1; + sizes << ':'; + for (unsigned argcount = 1; + argcount < CI->getNumArgOperands() + && argcount <= opConvSpecifiers.size(); + argcount++) { + Value *arg = CI->getArgOperand(argcount); + Type *argtype = arg->getType(); + unsigned argsize = TD->getTypeAllocSizeInBits(argtype); + argsize = argsize/8; + // + // ArgSize by design should be a multiple of DWORD_ALIGN, + // expand the arguments that do not follow this rule. + // + if (argsize % DWORD_ALIGN != 0) { + llvm::Type* resType = llvm::Type::getInt32Ty(Ctx); + VectorType* llvmVecType = llvm::dyn_cast(argtype); + int numEle = llvmVecType ? 
llvmVecType->getNumElements() : 1; + if (llvmVecType && numEle > 1) + resType = llvm::VectorType::get(resType, numEle);//static_cast(numEle)); + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + if (opConvSpecifiers[argcount - 1] == 'x' || + opConvSpecifiers[argcount - 1] == 'X' || + opConvSpecifiers[argcount - 1] == 'u' || + opConvSpecifiers[argcount - 1] == 'o') + arg = Builder.CreateZExt(arg, resType); + else + arg = Builder.CreateSExt(arg, resType); + argtype = arg->getType(); + argsize = TD->getTypeAllocSizeInBits(argtype); + argsize = argsize / 8; + CI->setOperand(argcount, arg); + } + if (opConvSpecifiers[argcount - 1] == 'f') { + ConstantFP *fpCons = dyn_cast(arg); + if (fpCons) + argsize = 4; + else { + FPExtInst *fpext = dyn_cast(arg); + if (fpext && fpext->getType()->isDoubleTy() && + fpext->getOperand(0)->getType()->isFloatTy()) + argsize = 4; + } + } + if (shouldPrintAsStr(opConvSpecifiers[argcount - 1], argtype)) { + if (ConstantExpr *strC = dyn_cast(arg)) { + GlobalVariable *strG + = dyn_cast(strC->getOperand(0)); + if (strG && strG->hasInitializer()) { + Constant *Init = strG->getInitializer(); + ConstantDataArray *strCA = dyn_cast(Init); + if (Init->isZeroValue() || strCA->isString()) { + size_t size_str = Init->isZeroValue() ? 
1 : + (strlen(strCA->getAsCString().data()) + 1); + size_t rem = size_str % DWORD_ALIGN; + size_t nsize_str = 0; + DEBUG(dbgs() << "Printf string original size = " << size_str << '\n'); + if (rem) { + nsize_str = size_str + (DWORD_ALIGN - rem); + } else { + nsize_str = size_str; + } + argsize = nsize_str; + } + } else { + argsize = sizeof(non_literal_str); + } + } else { + argsize = sizeof(non_literal_str); + } + } + DEBUG(dbgs() << "Printf argsize (in buffer) = " + << argsize << " for type: " << *argtype << '\n'); + sizes << argsize << ':'; + sum += argsize; + } + DEBUG(dbgs() << "Printf format string in source = " + << str.str() << '\n'); + for (size_t i = 0; i < str.size(); ++i) { + // Rest of the C escape sequences (e.g. \') are handled correctly + // by the MDParser + switch (str[i]) { + case '\a': + sizes << "\\a"; + break; + case '\b': + sizes << "\\b"; + break; + case '\f': + sizes << "\\f"; + break; + case '\n': + sizes << "\\n"; + break; + case '\r': + sizes << "\\r"; + break; + case '\v': + sizes << "\\v"; + break; + case ':': + // ':' cannot be scanned by Flex, as it is defined as a delimiter + // Replace it with it's octal representation \72 + sizes << "\\72"; + break; + default: + sizes << str[i]; + break; + } + } + + // Insert the printf_alloc call + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + AttributeSet attr = AttributeSet::get(Ctx, AttributeSet::FunctionIndex, + Attribute::NoUnwind); + + Type *sizetTy = Type::getInt32Ty(Ctx); + + Type *Tys_alloc[1] = { sizetTy }; + Type *I8Ptr = PointerType::get( Type::getInt8Ty(Ctx), 1); + FunctionType *FTy_alloc + = FunctionType::get( I8Ptr, Tys_alloc, false); + Constant *printf_alloc_fn + = M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, attr); + Function *afn = dyn_cast(printf_alloc_fn); + afn->setCallingConv(llvm::CallingConv::SPIR_FUNC); + DEBUG(dbgs() << "inserting printf_alloc decl, an extern @ pre-link:"); + DEBUG(dbgs() << *afn); + + DEBUG(dbgs() 
<< "Printf metadata = " << sizes.str() << '\n'); + std::string fmtstr = itostr(++UniqID) + ":" + sizes.str().c_str(); + MDString *fmtStrArray + = MDString::get( Ctx, fmtstr ); + + + // Instead of creating global variables, the + // printf format strings are extracted + // and passed as metadata. This avoids + // polluting llvm's symbol tables in this module. + // Metadata is going to be extracted + // by the backend passes and inserted + // into the OpenCL binary as appropriate. + StringRef amd("llvm.printf.fmts"); + NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd); + MDNode *myMD = MDNode::get(Ctx,fmtStrArray); + metaD->addOperand(myMD); + Value *sumC = ConstantInt::get( sizetTy, sum, false); + SmallVector alloc_args; + alloc_args.push_back(sumC); + CallInst *pcall = CallInst::Create( afn, alloc_args, + "printf_alloc_fn", CI); + pcall->setCallingConv(llvm::CallingConv::SPIR_FUNC); + + // + // Insert code to split basicblock with a + // piece of hammock code. + // basicblock splits after buffer overflow check + // + ConstantPointerNull *zeroIntPtr + = ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), + 1)); + ICmpInst *cmp + = dyn_cast( + Builder.CreateICmpNE(pcall, zeroIntPtr, "")); + if (!CI->use_empty()) { + Value *result = Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, + "printf_res"); + CI->replaceAllUsesWith(result); + } + SplitBlock(CI->getParent(), cmp); + TerminatorInst *brnch + = SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false); + + Builder.SetInsertPoint(brnch); + + // store unique printf id in the buffer + // + SmallVector ZeroIdxList; + ConstantInt* zeroInt + = ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10)); + ZeroIdxList.push_back(zeroInt); + + GetElementPtrInst *buffer_idx + = dyn_cast( + GetElementPtrInst::Create(nullptr, + pcall, ZeroIdxList, "PrintBuffID", brnch)); + + Type *idPointer + = PointerType::get(I32Ty, GlobalAddrspace); + Value *id_gep_cast + = new BitCastInst( buffer_idx, idPointer, + 
"PrintBuffIdCast", brnch); + + StoreInst* stbuff + = new StoreInst( ConstantInt::get(I32Ty, UniqID), id_gep_cast); + stbuff->insertBefore(brnch); // to remove unused variable warning + + SmallVector FourthIdxList; + ConstantInt* fourInt + = ConstantInt::get(Ctx, APInt( + 32, StringRef("4"), 10)); + + FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id + // the following GEP is the buffer pointer + buffer_idx + = cast(GetElementPtrInst::Create(nullptr, + pcall, FourthIdxList, "PrintBuffGep", brnch)); + + Type* Int32Ty = Type::getInt32Ty(Ctx); + Type* Int64Ty = Type::getInt64Ty(Ctx); + for (unsigned argcount = 1; + argcount < CI->getNumArgOperands() + && argcount <= opConvSpecifiers.size(); + argcount++) { + Value *arg = CI->getArgOperand(argcount); + Type *argType = arg->getType(); + SmallVector whatToStore; + if (argType->isFPOrFPVectorTy() + && (argType->getTypeID() != Type::VectorTyID)) { + Type *iType = (argType->isFloatTy()) ? Int32Ty : Int64Ty; + if (opConvSpecifiers[argcount - 1] == 'f') { + ConstantFP *fpCons = dyn_cast(arg); + if (fpCons) { + APFloat Val(fpCons->getValueAPF()); + bool lost = false; + Val.convert(APFloat::IEEEsingle, + APFloat::rmNearestTiesToEven, + &lost); + arg = ConstantFP::get(Ctx, Val); + iType = Int32Ty; + } else { + FPExtInst *fpext = dyn_cast(arg); + if (fpext && fpext->getType()->isDoubleTy() + && fpext->getOperand(0)->getType()->isFloatTy()) { + arg = fpext->getOperand(0); + iType = Int32Ty; + } + } + } + arg = new BitCastInst(arg, iType, "PrintArgFP", brnch); + whatToStore.push_back(arg); + } else if (argType->getTypeID() == Type::PointerTyID) { + if (shouldPrintAsStr(opConvSpecifiers[argcount - 1], argType)) { + const char *s = non_literal_str; + if (ConstantExpr *strC = dyn_cast(arg)) { + GlobalVariable *strG + = dyn_cast(strC->getOperand(0)); + if (strG && strG->hasInitializer()) { + Constant *Init = strG->getInitializer(); + ConstantDataArray *strCA = dyn_cast(Init); + if (Init->isZeroValue() || 
strCA->isString()) { + s = Init->isZeroValue() ? "" : strCA->getAsCString().data(); + } + } + } + size_t size_str = strlen(s) + 1; + size_t rem = size_str % DWORD_ALIGN; + size_t nsize_str = 0; + if (rem) { + nsize_str = size_str + (DWORD_ALIGN - rem); + } else { + nsize_str = size_str; + } + if (s[0]) { + char *mynewstr = new char[nsize_str](); + strcpy(mynewstr, s); + int numints = nsize_str/4; + int charc = 0; + while(numints) { + int anum = *(int*)(mynewstr+charc); + charc += 4; + numints--; + Value *anumV = ConstantInt::get( Int32Ty, anum, false); + whatToStore.push_back(anumV); + } + delete mynewstr; + } else { + // Empty string, give a hint to RT it is no NULL + Value *anumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false); + whatToStore.push_back(anumV); + } + } else { + uint64_t Size = TD->getTypeAllocSizeInBits(argType); + assert((Size == 32 || Size == 64) && "unsupported size"); + Type* DstType = (Size == 32) ? Int32Ty : Int64Ty; + arg = new PtrToIntInst(arg, DstType, + "PrintArgPtr", brnch); + whatToStore.push_back(arg); + } + } else if (argType->getTypeID() == Type::VectorTyID) { + Type *iType = NULL; + uint32_t eleCount = cast(argType)->getNumElements(); + uint32_t eleSize = argType->getScalarSizeInBits(); + uint32_t totalSize = eleCount * eleSize; + if (eleCount == 3) { + IntegerType *int32ty + = Type::getInt32Ty(argType->getContext()); + Constant* indices[4] + = { ConstantInt::get(int32ty, 0), + ConstantInt::get(int32ty, 1), + ConstantInt::get(int32ty, 2), + ConstantInt::get(int32ty, 2) + }; + Constant* mask = ConstantVector::get(indices); + ShuffleVectorInst* shuffle + = new ShuffleVectorInst(arg, arg, mask); + shuffle->insertBefore(brnch); + arg = shuffle; + argType = arg->getType(); + totalSize += eleSize; + } + switch (eleSize) { + default: + eleCount = totalSize / 64; + iType = dyn_cast( + Type::getInt64Ty( + argType->getContext())); + break; + case 8: + if (eleCount >= 8) { + eleCount = totalSize / 64; + iType = dyn_cast( + Type::getInt64Ty( + 
argType->getContext())); + } else if (eleCount >= 3) { + eleCount = 1; + iType = dyn_cast( + Type::getInt32Ty( + argType->getContext())); + } else { + eleCount = 1; + iType = dyn_cast( + Type::getInt16Ty( + argType->getContext())); + } + break; + case 16: + if (eleCount >= 3) { + eleCount = totalSize / 64; + iType = dyn_cast( + Type::getInt64Ty( + argType->getContext())); + } else { + eleCount = 1; + iType = dyn_cast( + Type::getInt32Ty( + argType->getContext())); + } + break; + } + if (eleCount > 1) { + iType = dyn_cast( + VectorType::get( + iType, eleCount)); + } + arg = new BitCastInst(arg, iType, "PrintArgVect", brnch); + whatToStore.push_back(arg); + } else { + whatToStore.push_back(arg); + } + + for ( SmallVectorImpl::iterator + w_iterate = whatToStore.begin(), + w_iterate_e = whatToStore.end(); + w_iterate != w_iterate_e; ) { + Value* thebtcast = *w_iterate; + unsigned argsize + = TD->getTypeAllocSizeInBits(thebtcast->getType())/8; + SmallVector buffOffset; + buffOffset.push_back( + ConstantInt::get( I32Ty, argsize)); + + Type *argPointer + = PointerType::get( thebtcast->getType(), 1); + Value *casted_gep + = new BitCastInst( buffer_idx, argPointer, + "PrintBuffPtrCast", brnch); + StoreInst* stbuff + = new StoreInst( + thebtcast, casted_gep, brnch); + DEBUG(dbgs() << "inserting store to printf buffer:\n" + << *stbuff << '\n'); + ++w_iterate; + if (w_iterate == w_iterate_e + && argcount+1 == CI->getNumArgOperands()) + break; + buffer_idx + = dyn_cast(GetElementPtrInst::Create( + nullptr, buffer_idx, buffOffset, "PrintBuffNextPtr", brnch)); + DEBUG(dbgs() << "inserting gep to the printf buffer:\n" + << *buffer_idx << '\n'); + } + } + } + } + //erase the printf calls + for (SmallVectorImpl::iterator + print_iterate = printfs.begin(), + print_iterate_e = printfs.end(); + print_iterate != print_iterate_e; + ++print_iterate) { + CallInst* CI + = dyn_cast( *print_iterate); + CI->eraseFromParent(); + } + return true; +} + +static bool isX86Triple(const llvm::Triple 
&Triple) { + return Triple.getArch() == llvm::Triple::x86 + || Triple.getArch() == llvm::Triple::x86_64; +} + +bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) { + if (isX86Triple(Triple(M.getTargetTriple()))) { + if (!prepare(M)) + return false; + return lowerPrintfForCpu(M); + } else { + if (!prepare(M)) + return false; + return lowerPrintfForGpu(M); + } +} + +const char* AMDGPUPrintfRuntimeBinding::getPassName() const { + return "AMD Printf lowering part 1"; +} + +bool AMDGPUPrintfRuntimeBinding::doInitialization(Module &M) { + return false; +} + +bool AMDGPUPrintfRuntimeBinding::doFinalization(Module &M) { + return false; +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -50,6 +50,7 @@ TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + virtual void addPreLinkPasses(PassManagerBase &) override; }; //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -33,6 +33,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/IR/LegacyPassManager.h" using namespace llvm; @@ -82,6 +83,7 @@ initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); + initializeAMDGPUPrintfRuntimeBindingPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -166,6 +168,11 @@ FSAttr.getValueAsString(); } +void AMDGPUTargetMachine::addPreLinkPasses(PassManagerBase &PM) { + PM.add(llvm::createAMDGPUPrintfRuntimeBinding()); +} + + //===----------------------------------------------------------------------===// // 
R600 Target Machine (R600 -> Cayman) //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -48,6 +48,7 @@ AMDGPUInstrInfo.cpp AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp + AMDGPUPrinfRuntimeBinding.cpp GCNHazardRecognizer.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp Index: test/CodeGen/AMDGPU/printf.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/printf.ll @@ -0,0 +1,59 @@ +; RUN: opt -mtriple=amdgcn--amdhsa -amdgpu-printf-runtime-binding -mcpu=fiji -S < %s | FileCheck %s +; CHECK-LABEL: entry +; CHECK: call spir_func i8 addrspace(1)* @__printf_alloc +; CHECK-LABEL: entry.split +; CHECK: icmp ne i8 addrspace(1)* %printf_alloc_fn, null +; CHECK: %PrintBuffID = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 0 +; CHECK: %PrintBuffIdCast = bitcast i8 addrspace(1)* %PrintBuffID to i32 addrspace(1)* +; CHECK: store i32 1, i32 addrspace(1)* %PrintBuffIdCast +; CHECK: %PrintBuffGep = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 4 +; CHECK: %PrintArgPtr = ptrtoint i8* %arraydecay to i64 +; CHECK: %PrintBuffPtrCast = bitcast i8 addrspace(1)* %PrintBuffGep to i64 addrspace(1)* +; CHECK: store i64 %PrintArgPtr, i64 addrspace(1)* %PrintBuffPtrCast +; CHECK: %PrintBuffNextPtr = getelementptr i8, i8 addrspace(1)* %PrintBuffGep, i32 8 +; CHECK: %PrintBuffPtrCast1 = bitcast i8 addrspace(1)* %PrintBuffNextPtr to i32 addrspace(1)* +; CHECK: store i32 %3, i32 addrspace(1)* %PrintBuffPtrCast1 +; @.str is the printf format; @test_kernel.str is copied by memcpy into an alloca'd buffer that is passed for the string directive. +@test_kernel.str = private unnamed_addr constant [9 x i8] c"globalid\00", align 1 +@.str = private unnamed_addr addrspace(2) constant [6 x i8] c"%s:%d\00", align 1 +; The string argument is a plain i8* (not addrspace(2)), so the checks above expect it to be stored as a pointer value (ptrtoint to i64), followed by the i32 argument. +define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %in.addr = alloca i32 
addrspace(1)*, align 4 + %out.addr = alloca i32 addrspace(1)*, align 4 + %n = alloca i32, align 4 + %str = alloca [9 x i8], align 1 + store i32 addrspace(1)* %in, i32 addrspace(1)** %in.addr, align 4 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 4 + %0 = bitcast i32* %n to i8* + %call = call i64 @_Z13get_global_idj(i32 0) #5 + %conv = trunc i64 %call to i32 + store i32 %conv, i32* %n, align 4 + %1 = bitcast [9 x i8]* %str to i8* + %2 = bitcast [9 x i8]* %str to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @test_kernel.str, i32 0, i32 0), i64 9, i32 1, i1 false) + %arraydecay = getelementptr inbounds [9 x i8], [9 x i8]* %str, i32 0, i32 0 + %3 = load i32, i32* %n, align 4 + %call1 = call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %3) + %4 = load i32, i32* %n, align 4 + %idxprom = sext i32 %4 to i64 + %5 = load i32 addrspace(1)*, i32 addrspace(1)** %in.addr, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom + %6 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %7 = load i32, i32* %n, align 4 + %idxprom2 = sext i32 %7 to i64 + %8 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 4 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom2 + store i32 %6, i32 addrspace(1)* %arrayidx3, align 4 + %9 = bitcast [9 x i8]* %str to i8* + %10 = bitcast i32* %n to i8* + ret void +} + +; Function Attrs: nounwind readnone +declare i64 @_Z13get_global_idj(i32) #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1 + +declare i32 @printf(i8 addrspace(2)*, ...) #3