Index: clang/include/clang/Basic/LangOptions.def =================================================================== --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -272,6 +272,7 @@ LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP") LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.") +LANGOPT(DelayedPrintf, 1, 0, "version of printf function to be used, hostcall or buffer based") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation") Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -981,6 +981,10 @@ TargetOpts<"NVPTXUseShortPointers">, DefaultFalse, PosFlag, NegFlag>; +def fdelayed_printf : Flag<["-"], "fdelayed-printf">, + HelpText<"Specifies which version of printf is to be used while CodeGen">, + Flags<[CC1Option]>, + MarshallingInfoFlag>; def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">, HelpText<"Specify default stream. The default value is 'legacy'. 
(HIP only)">, Flags<[CC1Option]>, Index: clang/lib/Basic/Builtins.cpp =================================================================== --- clang/lib/Basic/Builtins.cpp +++ clang/lib/Basic/Builtins.cpp @@ -89,11 +89,15 @@ bool CUDAUnsupported = !LangOpts.CUDA && BuiltinInfo.Langs == CUDA_LANG; bool CPlusPlusUnsupported = !LangOpts.CPlusPlus && BuiltinInfo.Langs == CXX_LANG; + // dependency of printf on "-fdelayed-printf" option + bool PrintfUnsupported = LangOpts.HIP && + llvm::StringRef(BuiltinInfo.Name).equals("printf") && + LangOpts.DelayedPrintf; return !BuiltinsUnsupported && !CorBuiltinsUnsupported && !MathBuiltinsUnsupported && !OclCUnsupported && !OclGASUnsupported && !OclPipeUnsupported && !OclDSEUnsupported && !OpenMPUnsupported && !GnuModeUnsupported && !MSModeUnsupported && !ObjCUnsupported && - !CPlusPlusUnsupported && !CUDAUnsupported; + !CPlusPlusUnsupported && !CUDAUnsupported && !PrintfUnsupported; } /// initializeBuiltins - Mark the identifiers for all the builtins with their Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -4652,8 +4652,19 @@ } CmdArgs.push_back("-aux-triple"); CmdArgs.push_back(Args.MakeArgString(NormalizedTriple)); + + if (JA.isDeviceOffloading(Action::OFK_HIP)) { + // Device side compilation printf + if (Args.getLastArg(options::OPT_fdelayed_printf)) + CmdArgs.push_back("-fdelayed-printf"); + } } + // unconditionally claim the printf option now to avoid unused diagnostic + // TODO: OpenCL targets need to use this option too + if (const Arg *PF = Args.getLastArg(options::OPT_fdelayed_printf)) + PF->claim(); + if (Args.hasFlag(options::OPT_fsycl, options::OPT_fno_sycl, false)) { CmdArgs.push_back("-fsycl-is-device"); Index: llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp =================================================================== --- 
llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -62,7 +63,7 @@ void getConversionSpecifiers(SmallVectorImpl &OpConvSpecifiers, StringRef fmt, size_t num_ops) const; - bool shouldPrintAsStr(char Specifier, Type *OpType) const; + bool shouldPrintAsStr(char Specifier) const { return (Specifier == 's'); } bool lowerPrintfForGpu(Module &M); Value *simplify(Instruction *I, const TargetLibraryInfo *TLI, @@ -131,79 +132,111 @@ } } -bool AMDGPUPrintfRuntimeBindingImpl::shouldPrintAsStr(char Specifier, - Type *OpType) const { - if (Specifier != 's') - return false; - const PointerType *PT = dyn_cast(OpType); - if (!PT || PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) - return false; - Type *ElemType = PT->getContainedType(0); - if (ElemType->getTypeID() != Type::IntegerTyID) - return false; - IntegerType *ElemIType = cast(ElemType); - return ElemIType->getBitWidth() == 8; +// This function is essentially a copy from the file +// Transforms/Utils/AMDGPUEmitPrintf.cpp +static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) { + auto *Prev = Builder.GetInsertBlock(); + Module *M = Prev->getModule(); + + auto CharZero = Builder.getInt8(0); + auto One = Builder.getInt64(1); + auto Zero = Builder.getInt64(0); + auto Int64Ty = Builder.getInt64Ty(); + + // The length is either zero for a null pointer, or the computed value for an + // actual string. We need a join block for a phi that represents the final + // value. 
+ BasicBlock *Join = nullptr; + if (Prev->getTerminator()) { + Join = Prev->splitBasicBlock(Builder.GetInsertPoint(), "strlen.join"); + Prev->getTerminator()->eraseFromParent(); + } else { + Join = + BasicBlock::Create(M->getContext(), "strlen.join", Prev->getParent()); + } + BasicBlock *While = BasicBlock::Create(M->getContext(), "strlen.while", + Prev->getParent(), Join); + BasicBlock *WhileDone = BasicBlock::Create( + M->getContext(), "strlen.while.done", Prev->getParent(), Join); + + // Emit an early return for when the pointer is null. + Builder.SetInsertPoint(Prev); + auto CmpNull = + Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType())); + BranchInst::Create(Join, While, CmpNull, Prev); + + // Entry to the while loop. + Builder.SetInsertPoint(While); + + auto PtrPhi = Builder.CreatePHI(Str->getType(), 2); + PtrPhi->addIncoming(Str, Prev); + auto PtrNext = Builder.CreateGEP(Builder.getInt8Ty(), PtrPhi, One); + PtrPhi->addIncoming(PtrNext, While); + + // Condition for the while loop. + auto Data = Builder.CreateLoad(Builder.getInt8Ty(), PtrPhi); + auto Cmp = Builder.CreateICmpEQ(Data, CharZero); + Builder.CreateCondBr(Cmp, WhileDone, While); + + // Add one to the computed length. + Builder.SetInsertPoint(WhileDone, WhileDone->begin()); + auto Begin = Builder.CreatePtrToInt(Str, Int64Ty); + auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty); + auto Len = Builder.CreateSub(End, Begin); + Len = Builder.CreateAdd(Len, One); + + // Final join. 
+ BranchInst::Create(Join, WhileDone); + Builder.SetInsertPoint(Join, Join->begin()); + auto LenPhi = Builder.CreatePHI(Len->getType(), 2); + LenPhi->addIncoming(Len, WhileDone); + LenPhi->addIncoming(Zero, Prev); + + return LenPhi; } bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { LLVMContext &Ctx = M.getContext(); IRBuilder<> Builder(Ctx); - Type *I32Ty = Type::getInt32Ty(Ctx); + Type *Int32Ty = Type::getInt32Ty(Ctx); + Type *Int64Ty = Type::getInt64Ty(Ctx); unsigned UniqID = 0; - // NB: This is important for this string size to be divisible by 4 - const char NonLiteralStr[4] = "???"; for (auto *CI : Printfs) { unsigned NumOps = CI->arg_size(); + StringRef Str; SmallString<16> OpConvSpecifiers; Value *Op = CI->getArgOperand(0); - if (auto LI = dyn_cast(Op)) { - Op = LI->getPointerOperand(); - for (auto *Use : Op->users()) { - if (auto SI = dyn_cast(Use)) { - Op = SI->getValueOperand(); - break; - } - } - } - - if (auto I = dyn_cast(Op)) { - Value *Op_simplified = - simplify(I, &GetTLI(*I->getFunction()), &GetDT(*I->getFunction())); - if (Op_simplified) - Op = Op_simplified; - } + // helper struct to package the string related data + typedef struct S { + StringRef Str; + llvm::Value *RealSize; + llvm::Value *AlignedSize; - ConstantExpr *ConstExpr = dyn_cast(Op); + S(StringRef str, llvm::Value *RS, llvm::Value *AS) + : Str(str), RealSize(RS), AlignedSize(AS) {} + } StringData; - if (ConstExpr) { - GlobalVariable *GVar = dyn_cast(ConstExpr->getOperand(0)); + if (getConstantStringInfo(Op, Str) && !Str.empty()) { + // we need this call to ascertain + // that we are printing a string + // or a pointer. 
It takes out the + // specifiers and fills up the first + // arg + getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1); - StringRef Str("unknown"); - if (GVar && GVar->hasInitializer()) { - auto *Init = GVar->getInitializer(); - if (auto *CA = dyn_cast(Init)) { - if (CA->isString()) - Str = CA->getAsCString(); - } else if (isa(Init)) { - Str = ""; - } - // - // we need this call to ascertain - // that we are printing a string - // or a pointer. It takes out the - // specifiers and fills up the first - // arg - getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1); - } // Add metadata for the string std::string AStreamHolder; raw_string_ostream Sizes(AStreamHolder); int Sum = DWORD_ALIGN; Sizes << CI->arg_size() - 1; Sizes << ':'; + uint32_t NonConstStringCnt = 0; + Value *NonConstStringValue = nullptr; + FunctionCallee strlenFunc, AlignFunc; + SmallVector StringContents; for (unsigned ArgCount = 1; ArgCount < CI->arg_size() && ArgCount <= OpConvSpecifiers.size(); ArgCount++) { @@ -216,11 +249,11 @@ // expand the arguments that do not follow this rule. // if (ArgSize % DWORD_ALIGN != 0) { - llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx); + llvm::Type *ResType = Int32Ty; auto *LLVMVecType = llvm::dyn_cast(ArgType); int NumElem = LLVMVecType ? 
LLVMVecType->getNumElements() : 1; if (LLVMVecType && NumElem > 1) - ResType = llvm::FixedVectorType::get(ResType, NumElem); + ResType = llvm::FixedVectorType::get(Int32Ty, NumElem); Builder.SetInsertPoint(CI); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); if (OpConvSpecifiers[ArgCount - 1] == 'x' || @@ -246,34 +279,56 @@ ArgSize = 4; } } - if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { - if (auto *ConstExpr = dyn_cast(Arg)) { - auto *GV = dyn_cast(ConstExpr->getOperand(0)); - if (GV && GV->hasInitializer()) { - Constant *Init = GV->getInitializer(); - bool IsZeroValue = Init->isZeroValue(); - auto *CA = dyn_cast(Init); - if (IsZeroValue || (CA && CA->isString())) { - size_t SizeStr = - IsZeroValue ? 1 : (strlen(CA->getAsCString().data()) + 1); - size_t Rem = SizeStr % DWORD_ALIGN; - size_t NSizeStr = 0; - LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr - << '\n'); - if (Rem) { - NSizeStr = SizeStr + (DWORD_ALIGN - Rem); - } else { - NSizeStr = SizeStr; - } - ArgSize = NSizeStr; - } + if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1])) { + StringRef S; + if (getConstantStringInfo(Arg, S, /*TrimAtNul*/ false)) { + size_t SizeStr = S.size(); + size_t Rem = SizeStr % DWORD_ALIGN; + LLVM_DEBUG(dbgs() + << "Printf string original size = " << SizeStr << '\n'); + size_t NSizeStr; + if (Rem) { + NSizeStr = SizeStr + (DWORD_ALIGN - Rem); } else { - ArgSize = sizeof(NonLiteralStr); + NSizeStr = SizeStr; } + ArgSize = NSizeStr; + + auto AlignedSize = ConstantInt::get(Int32Ty, ArgSize, false); + StringContents.push_back(StringData(S, AlignedSize, AlignedSize)); + Sum += ArgSize; } else { - ArgSize = sizeof(NonLiteralStr); + Builder.SetInsertPoint(CI); + auto strlen = getStrlenWithNull(Builder, Arg); + + // Align the computed length to next 4 byte boundary + auto Temp = Builder.CreateAdd( + strlen, ConstantInt::get(strlen->getType(), 3U)); + auto alignedLen = Builder.CreateAnd( + Temp, ConstantInt::get(Type::getInt64Ty(Ctx), ~3U)); + + 
if (NonConstStringCnt) { + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + auto Val = Builder.CreateAdd(alignedLen, NonConstStringValue, + "cumulativeAdd"); + NonConstStringValue = Val; + } else + NonConstStringValue = alignedLen; + + // actual string not known here, hence keep the field empty. + StringContents.push_back(StringData("", strlen, alignedLen)); + NonConstStringCnt++; } + + // The non const string sizes in metadata are always updated + // to be size of the pointer(8 bytes). + // The runtime handles the calculation of actual string sizes. + Sizes << 8 << ':'; + + // string argument handled, continue with next argument + continue; } + LLVM_DEBUG(dbgs() << "Printf ArgSize (in buffer) = " << ArgSize << " for type: " << *ArgType << '\n'); Sizes << ArgSize << ':'; @@ -281,6 +336,7 @@ } LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str() << '\n'); + for (char C : Str) { // Rest of the C escape sequences (e.g. \') are handled correctly // by the MDParser @@ -321,9 +377,7 @@ AttributeList Attr = AttributeList::get(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind); - Type *SizetTy = Type::getInt32Ty(Ctx); - - Type *Tys_alloc[1] = {SizetTy}; + Type *Tys_alloc[1] = {Int32Ty}; Type *I8Ty = Type::getInt8Ty(Ctx); Type *I8Ptr = PointerType::get(I8Ty, 1); FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false); @@ -345,8 +399,14 @@ NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd); MDNode *myMD = MDNode::get(Ctx, fmtStrArray); metaD->addOperand(myMD); - Value *sumC = ConstantInt::get(SizetTy, Sum, false); + Value *sumC = ConstantInt::get(Type::getInt64Ty(Ctx), Sum, false); SmallVector alloc_args; + if (NonConstStringValue) + sumC = Builder.CreateAdd(NonConstStringValue, sumC); + + // Truncate the string size to 32 bits, + sumC = Builder.CreateTrunc(sumC, Int32Ty); + alloc_args.push_back(sumC); CallInst *pcall = CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI); @@ -361,7 +421,7 @@ auto *cmp = 
cast(Builder.CreateICmpNE(pcall, zeroIntPtr, "")); if (!CI->use_empty()) { Value *result = - Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res"); + Builder.CreateSExt(Builder.CreateNot(cmp), Int32Ty, "printf_res"); CI->replaceAllUsesWith(result); } SplitBlock(CI->getParent(), cmp); @@ -376,11 +436,11 @@ I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID", Brnch); - Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS); + Type *idPointer = PointerType::get(Int32Ty, AMDGPUAS::GLOBAL_ADDRESS); Value *id_gep_cast = new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch); - new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch); + new StoreInst(ConstantInt::get(Int32Ty, UniqID), id_gep_cast, Brnch); // 1st 4 bytes hold the printf_id // the following GEP is the buffer pointer @@ -388,8 +448,7 @@ I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 4)), "PrintBuffGep", Brnch); - Type *Int32Ty = Type::getInt32Ty(Ctx); - Type *Int64Ty = Type::getInt64Ty(Ctx); + int curStringIdx = 0; for (unsigned ArgCount = 1; ArgCount < CI->arg_size() && ArgCount <= OpConvSpecifiers.size(); ArgCount++) { @@ -417,44 +476,64 @@ Arg = new BitCastInst(Arg, IType, "PrintArgFP", Brnch); WhatToStore.push_back(Arg); } else if (ArgType->getTypeID() == Type::PointerTyID) { - if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { - const char *S = NonLiteralStr; - if (auto *ConstExpr = dyn_cast(Arg)) { - auto *GV = dyn_cast(ConstExpr->getOperand(0)); - if (GV && GV->hasInitializer()) { - Constant *Init = GV->getInitializer(); - bool IsZeroValue = Init->isZeroValue(); - auto *CA = dyn_cast(Init); - if (IsZeroValue || (CA && CA->isString())) { - S = IsZeroValue ? 
"" : CA->getAsCString().data(); + if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1])) { + // get the string "Arg" points to + if (auto ConstVal = dyn_cast( + StringContents[curStringIdx].RealSize)) { + // I guess its safe to use ConstVal directly without guards here + // since we know the string contents already, + auto NSizeStr = ConstVal->getZExtValue(); + StringRef S = StringContents[curStringIdx].Str; + + // since we know string contents, push them to printf buffer + // as 4 byte chunks rather than using memcpy. + if (S[0]) { + char *MyNewStr = new char[NSizeStr](); + strcpy(MyNewStr, S.str().c_str()); + int NumInts = NSizeStr / 4; + int CharC = 0; + while (NumInts) { + int ANum = *(int *)(MyNewStr + CharC); + CharC += 4; + NumInts--; + Value *ANumV = ConstantInt::get(Int32Ty, ANum, false); + WhatToStore.push_back(ANumV); } - } - } - size_t SizeStr = strlen(S) + 1; - size_t Rem = SizeStr % DWORD_ALIGN; - size_t NSizeStr = 0; - if (Rem) { - NSizeStr = SizeStr + (DWORD_ALIGN - Rem); - } else { - NSizeStr = SizeStr; - } - if (S[0]) { - char *MyNewStr = new char[NSizeStr](); - strcpy(MyNewStr, S); - int NumInts = NSizeStr / 4; - int CharC = 0; - while (NumInts) { - int ANum = *(int *)(MyNewStr + CharC); - CharC += 4; - NumInts--; - Value *ANumV = ConstantInt::get(Int32Ty, ANum, false); + delete[] MyNewStr; + } else { + // Empty string, give a hint to RT it is no NULL + Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false); WhatToStore.push_back(ANumV); } - delete[] MyNewStr; + curStringIdx++; } else { - // Empty string, give a hint to RT it is no NULL - Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false); - WhatToStore.push_back(ANumV); + auto val = StringContents[curStringIdx].RealSize; + Type *Tys[] = {BufferIdx->getType(), Arg->getType(), + val->getType()}; + Function *TheFn = + Intrinsic::getDeclaration(&M, Intrinsic::memcpy, Tys); + SmallVector BuffOffset; + + Value *Args[] = {BufferIdx, Arg, val, + ConstantInt::get(Type::getInt1Ty(Ctx), 
false)}; + + // This copies the contents of the string; however, the next offset + // is at the aligned length. The extra space that might be created due + // to alignment padding is not populated with any specific value + // here; this should be safe as long as the runtime is in sync with + // the offsets. + CallInst::Create(TheFn->getFunctionType(), TheFn, Args, + llvm::None, "", Brnch); + + Builder.SetInsertPoint(Brnch); + BuffOffset.push_back(StringContents[curStringIdx].AlignedSize); + BufferIdx = GetElementPtrInst::Create(I8Ty, BufferIdx, BuffOffset, + "PrintBuffNextPtr", Brnch); + LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n" + << *BufferIdx << '\n'); + curStringIdx++; + // done with current argument, move to next + continue; } } else { uint64_t Size = TD->getTypeAllocSizeInBits(ArgType); @@ -516,7 +595,7 @@ unsigned ArgSize = TD->getTypeAllocSizeInBits(TheBtCast->getType()) / 8; SmallVector BuffOffset; - BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize)); + BuffOffset.push_back(ConstantInt::get(Int32Ty, ArgSize)); Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1); Value *CastedGEP = Index: llvm/test/CodeGen/AMDGPU/hip-delayed-printf.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/hip-delayed-printf.ll @@ -0,0 +1,57 @@ +; RUN: opt -mtriple=amdgcn--amdhsa -passes=amdgpu-printf-runtime-binding -S < %s | FileCheck --check-prefix=FUNC --check-prefix=GCN --check-prefix=METADATA %s + +; FUNC-LABEL: @test_kernel( +; GCN-LABEL: entry +; GCN-LABEL: strlen.while +; GCN: br i1 %6, label %strlen.while.done, label %strlen.while +; GCN-LABEL: strlen.join +; GCN: %12 = add i64 %11, 3 +; GCN: %13 = and i64 %12, 4294967292 +; GCN: %14 = add i64 %13, 4 +; GCN: %15 = trunc i64 %14 to i32 +; GCN: %printf_alloc_fn = call ptr addrspace(1) @__printf_alloc(i32 %15) +; GCN-LABEL: strlen.join.split +; GCN: %16 = icmp ne ptr addrspace(1) %printf_alloc_fn, null +; GCN: br i1 %16, label %17, label 
%18 +; GCN: %PrintBuffID = getelementptr i8, ptr addrspace(1) %printf_alloc_fn, i32 0 +; GCN: %PrintBuffIdCast = bitcast ptr addrspace(1) %PrintBuffID to ptr addrspace(1) +; GCN: store i32 1, ptr addrspace(1) %PrintBuffIdCast, align 4 +; GCN: %PrintBuffGep = getelementptr i8, ptr addrspace(1) %printf_alloc_fn, i32 4 +; GCN: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) %PrintBuffGep, ptr %1, i64 %11, i1 false) +; GCN: %PrintBuffNextPtr = getelementptr i8, ptr addrspace(1) %PrintBuffGep, i64 %13 +; GCN: br label %18 + +; METADATA: !llvm.printf.fmts = !{!0} +; METADATA: !0 = !{!"1:1:8:%s"} + +@.str = private unnamed_addr addrspace(4) constant [3 x i8] c"%s\00", align 1 +@.str.1 = private unnamed_addr addrspace(4) constant [6 x i8] c"hello\00", align 1 +@.str.2 = private unnamed_addr addrspace(4) constant [6 x i8] c"world\00", align 1 + + +define amdgpu_kernel void @test_kernel() { +entry: + %q = alloca ptr, align 8, addrspace(5) + %p = alloca i32, align 4, addrspace(5) + %q.ascast = addrspacecast ptr addrspace(5) %q to ptr + %p.ascast = addrspacecast ptr addrspace(5) %p to ptr + store i32 25, ptr %p.ascast, align 4 + %0 = load i32, ptr %p.ascast, align 4 + %cmp = icmp sgt i32 %0, 30 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + store ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), ptr %q.ascast, align 8 + br label %if.end + +if.else: ; preds = %entry + store ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), ptr %q.ascast, align 8 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %1 = load ptr, ptr %q.ascast, align 8 + %call = call i32 (ptr, ...) @printf(ptr noundef addrspacecast (ptr addrspace(4) @.str to ptr), ptr noundef %1) + ret void +} + +declare i32 @printf(ptr, ...) 
\ No newline at end of file Index: llvm/test/CodeGen/AMDGPU/opencl-printf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opencl-printf.ll +++ llvm/test/CodeGen/AMDGPU/opencl-printf.ll @@ -9,18 +9,17 @@ ; R600: call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %n) ; GCN-LABEL: entry ; GCN: call i8 addrspace(1)* @__printf_alloc -; GCN-LABEL: entry.split +; GCN-LABEL: strlen.join.split ; GCN: icmp ne i8 addrspace(1)* %printf_alloc_fn, null +; GCN: br i1 %14, label %15, label %16 ; GCN: %PrintBuffID = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 0 ; GCN: %PrintBuffIdCast = bitcast i8 addrspace(1)* %PrintBuffID to i32 addrspace(1)* -; GCN: store i32 1, i32 addrspace(1)* %PrintBuffIdCast +; GCN: store i32 1, i32 addrspace(1)* %PrintBuffIdCast, align 4 ; GCN: %PrintBuffGep = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 4 -; GCN: %PrintArgPtr = ptrtoint i8* %arraydecay to i64 -; GCN: %PrintBuffPtrCast = bitcast i8 addrspace(1)* %PrintBuffGep to i64 addrspace(1)* -; GCN: store i64 %PrintArgPtr, i64 addrspace(1)* %PrintBuffPtrCast -; GCN: %PrintBuffNextPtr = getelementptr i8, i8 addrspace(1)* %PrintBuffGep, i32 8 -; GCN: %PrintBuffPtrCast1 = bitcast i8 addrspace(1)* %PrintBuffNextPtr to i32 addrspace(1)* -; GCN: store i32 %n, i32 addrspace(1)* %PrintBuffPtrCast1 +; GCN: call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %PrintBuffGep, i8* %arraydecay, i64 %9, i1 false) +; GCN: %PrintBuffNextPtr = getelementptr i8, i8 addrspace(1)* %PrintBuffGep, i64 %11 +; GCN: %PrintBuffPtrCast = bitcast i8 addrspace(1)* %PrintBuffNextPtr to i32 addrspace(1)* +; GCN: store i32 %n, i32 addrspace(1)* %PrintBuffPtrCast, align 4 @.str = private unnamed_addr addrspace(2) constant [6 x i8] c"%s:%d\00", align 1