Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp @@ -1963,6 +1963,9 @@ return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy)); break; } + case Builtin::BIprintf: + if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) + return EmitCUDADevicePrintfCallExpr(E, ReturnValue); } // If this is an alias for a lib function (e.g. __builtin_sin), emit Index: cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp @@ -0,0 +1,131 @@ +//===----- CGCUDABuiltin.cpp - Codegen for CUDA builtins ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Generates code for built-in CUDA calls which are not runtime-specific. +// (Runtime-specific codegen lives in CGCUDARuntime.) 
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenFunction.h"
+#include "clang/Basic/Builtins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+// Returns M's declaration of vprintf, creating it if it doesn't exist yet.
+static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
+  llvm::LLVMContext &Ctx = M.getContext();
+  llvm::Type *CharPtrTy = llvm::Type::getInt8PtrTy(Ctx);
+  llvm::Type *ParamTys[] = {CharPtrTy, CharPtrTy};
+  llvm::FunctionType *FnTy = llvm::FunctionType::get(
+      llvm::Type::getInt32Ty(Ctx), ParamTys, /*isVarArg=*/false);
+
+  llvm::Function *Existing = M.getFunction("vprintf");
+  if (!Existing) {
+    // vprintf isn't in the module yet; create an external declaration for it.
+    return llvm::Function::Create(
+        FnTy, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
+  }
+  // Our CUDA system header declares vprintf with the right signature, so
+  // nobody else should have been able to declare it with a bogus signature.
+  assert(Existing->getFunctionType() == FnTy);
+  return Existing;
+}
+
+// Transforms a call to printf into a call to the NVPTX vprintf syscall (which
+// isn't particularly special; it's invoked just like a regular function).
+// vprintf takes two args: A format string, and a pointer to a buffer containing
+// the varargs.
+//
+// For example, the call
+//
+//   printf("format string", arg1, arg2, arg3);
+//
+// is converted into something resembling
+//
+//   char* buf = alloca(...);
+//   *reinterpret_cast<Arg1*>(buf) = arg1;
+//   *reinterpret_cast<Arg2*>(buf + ...) = arg2;
+//   *reinterpret_cast<Arg3*>(buf + ...) = arg3;
+//   vprintf("format string", buf);
+//
+// buf is aligned to the max of {alignof(Arg1), ...}. Furthermore, each of the
+// args is itself aligned to its preferred alignment.
+//
+// Note that by the time this function runs, E's args have already undergone the
+// standard C vararg promotion (short -> int, float -> double, etc.).
+RValue
+CodeGenFunction::EmitCUDADevicePrintfCallExpr(const CallExpr *E,
+                                              ReturnValueSlot ReturnValue) {
+  assert(getLangOpts().CUDA && getLangOpts().CUDAIsDevice);
+  assert(E->getBuiltinCallee() == Builtin::BIprintf);
+  assert(E->getNumArgs() >= 1); // printf always has at least one arg.
+
+  const llvm::DataLayout &DL = CGM.getDataLayout();
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+
+  CallArgList Args;
+  EmitCallArgs(Args,
+               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               E->arguments(), E->getDirectCallee(),
+               /* ParamsToSkip = */ 0);
+
+  // Figure out how large of a buffer we need to hold our varargs and how
+  // aligned the buffer needs to be. We start iterating at Arg[1], because
+  // that's our first vararg.
+  unsigned BufSize = 0;
+  unsigned BufAlign = 0;
+  for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
+    const RValue &RV = Args[I].RV;
+    if (!RV.isScalar()) {
+      // Non-scalar varargs are unsupported; diagnose instead of asserting.
+      CGM.ErrorUnsupported(E, "non-scalar arg to printf");
+      return RValue::get(llvm::ConstantInt::get(Int32Ty, 0));
+    }
+    llvm::Type *Ty = RV.getScalarVal()->getType();
+    auto Align = DL.getPrefTypeAlignment(Ty);
+    BufAlign = std::max(BufAlign, Align);
+    // Add padding required to keep the current arg aligned.
+    BufSize = llvm::alignTo(BufSize, Align);
+    BufSize += DL.getTypeAllocSize(Ty);
+  }
+
+  // Construct and fill the buffer.
+  llvm::Value *BufferPtr = nullptr;
+  if (BufSize == 0) {
+    // If there are no args, pass a null pointer to vprintf.
+    BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
+  } else {
+    BufferPtr = Builder.Insert(new llvm::AllocaInst(
+        llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize),
+        BufAlign, "printf_arg_buf"));
+
+    unsigned Offset = 0;
+    for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
+      llvm::Value *Arg = Args[I].RV.getScalarVal();
+      llvm::Type *Ty = Arg->getType();
+      auto Align = DL.getPrefTypeAlignment(Ty);
+      // Pad the buffer to Arg's alignment, then store Arg at that offset.
+      Offset = llvm::alignTo(Offset, Align);
+      llvm::Value *GEP =
+          Builder.CreateGEP(BufferPtr, llvm::ConstantInt::get(Int32Ty, Offset));
+      llvm::Value *Cast = Builder.CreateBitCast(GEP, Ty->getPointerTo());
+      Builder.CreateAlignedStore(Arg, Cast, Align);
+      Offset += DL.getTypeAllocSize(Ty);
+    }
+  }
+
+  // Invoke vprintf and return.
+  llvm::Function *VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  return RValue::get(
+      Builder.CreateCall(VprintfFunc, {Args[0].RV.getScalarVal(), BufferPtr}));
+}
Index: cfe/trunk/lib/CodeGen/CMakeLists.txt
===================================================================
--- cfe/trunk/lib/CodeGen/CMakeLists.txt
+++ cfe/trunk/lib/CodeGen/CMakeLists.txt
@@ -32,6 +32,7 @@
   CGAtomic.cpp
   CGBlocks.cpp
   CGBuiltin.cpp
+  CGCUDABuiltin.cpp
   CGCUDANV.cpp
   CGCUDARuntime.cpp
   CGCXX.cpp
Index: cfe/trunk/lib/CodeGen/CodeGenFunction.h
===================================================================
--- cfe/trunk/lib/CodeGen/CodeGenFunction.h
+++ cfe/trunk/lib/CodeGen/CodeGenFunction.h
@@ -2711,6 +2711,8 @@
 
   RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
                                 ReturnValueSlot ReturnValue);
+  RValue EmitCUDADevicePrintfCallExpr(const CallExpr *E,
+                                      ReturnValueSlot ReturnValue);
 
   RValue EmitBuiltinExpr(const FunctionDecl *FD,
                          unsigned BuiltinID, const CallExpr *E,
Index: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
===================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -210,6 +210,11 @@
 static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
   return __nvvm_reflect("NONE");
 }
+
+// The nvptx vprintf syscall. This doesn't actually need to be declared, but we
+// declare it so that if someone else declares it with a different signature,
+// we'll throw an error.
+extern "C" __device__ int vprintf(const char*, const char*);
 #endif
 
 #endif // __CUDA__
Index: cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h
===================================================================
--- cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h
+++ cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h
@@ -18,3 +18,5 @@
 
 int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                       cudaStream_t stream = 0);
+
+extern "C" __device__ int printf(const char*, ...);
Index: cfe/trunk/test/CodeGenCUDA/printf.cu
===================================================================
--- cfe/trunk/test/CodeGenCUDA/printf.cu
+++ cfe/trunk/test/CodeGenCUDA/printf.cu
@@ -0,0 +1,53 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm \
+// RUN: -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+extern "C" __device__ int vprintf(const char*, const char*);
+
+// Check a simple call to printf end-to-end.
+__device__ int CheckSimple() {
+  // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt
+  const char* fmt = "%d";
+  // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca i8, i32 4, align 4
+  // CHECK: [[PTR:%[0-9]+]] = getelementptr i8, i8* [[BUF]], i32 0
+  // CHECK: [[CAST:%[0-9]+]] = bitcast i8* [[PTR]] to i32*
+  // CHECK: store i32 42, i32* [[CAST]], align 4
+  // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF]])
+  // CHECK: ret i32 [[RET]]
+  return printf(fmt, 42);
+}
+
+// Check that the args' types are promoted correctly when we call printf.
+__device__ void CheckTypes() {
+  // CHECK: alloca {{.*}} align 8
+  // CHECK: getelementptr {{.*}} i32 0
+  // CHECK: bitcast {{.*}} to i32*
+  // CHECK: getelementptr {{.*}} i32 4
+  // CHECK: bitcast {{.*}} to i32*
+  // CHECK: getelementptr {{.*}} i32 8
+  // CHECK: bitcast {{.*}} to double*
+  // CHECK: getelementptr {{.*}} i32 16
+  // CHECK: bitcast {{.*}} to double*
+  printf("%d %d %f %f", (char)1, (short)2, 3.0f, 4.0);
+}
+
+// Check that the args are aligned properly in the buffer.
+__device__ void CheckAlign() {
+  // CHECK: alloca i8, i32 40, align 8
+  // CHECK: getelementptr {{.*}} i32 0
+  // CHECK: getelementptr {{.*}} i32 8
+  // CHECK: getelementptr {{.*}} i32 16
+  // CHECK: getelementptr {{.*}} i32 20
+  // CHECK: getelementptr {{.*}} i32 24
+  // CHECK: getelementptr {{.*}} i32 32
+  printf("%d %f %d %d %d %lld", 1, 2.0, 3, 4, 5, (long long)6);
+}
+
+__device__ void CheckNoArgs() { // No varargs: vprintf gets a null buffer.
+  // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
+  printf("hello, world!");
+}