Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp @@ -2620,8 +2620,8 @@ Arg)); } case Builtin::BIprintf: - if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) - return EmitCUDADevicePrintfCallExpr(E, ReturnValue); + if (getTarget().getTriple().isNVPTX()) + return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue); break; case Builtin::BI__builtin_canonicalize: case Builtin::BI__builtin_canonicalizef: Index: cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp @@ -1,123 +0,0 @@ -//===----- CGCUDABuiltin.cpp - Codegen for CUDA builtins ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Generates code for built-in CUDA calls which are not runtime-specific. -// (Runtime-specific codegen lives in CGCUDARuntime.) -// -//===----------------------------------------------------------------------===// - -#include "CodeGenFunction.h" -#include "clang/Basic/Builtins.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Instruction.h" -#include "llvm/Support/MathExtras.h" - -using namespace clang; -using namespace CodeGen; - -static llvm::Function *GetVprintfDeclaration(llvm::Module &M) { - llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()), - llvm::Type::getInt8PtrTy(M.getContext())}; - llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get( - llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false); - - if (auto* F = M.getFunction("vprintf")) { - // Our CUDA system header declares vprintf with the right signature, so - // nobody else should have been able to declare vprintf with a bogus - // signature. - assert(F->getFunctionType() == VprintfFuncType); - return F; - } - - // vprintf doesn't already exist; create a declaration and insert it into the - // module. - return llvm::Function::Create( - VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M); -} - -// Transforms a call to printf into a call to the NVPTX vprintf syscall (which -// isn't particularly special; it's invoked just like a regular function). -// vprintf takes two args: A format string, and a pointer to a buffer containing -// the varargs. -// -// For example, the call -// -// printf("format string", arg1, arg2, arg3); -// -// is converted into something resembling -// -// struct Tmp { -// Arg1 a1; -// Arg2 a2; -// Arg3 a3; -// }; -// char* buf = alloca(sizeof(Tmp)); -// *(Tmp*)buf = {a1, a2, a3}; -// vprintf("format string", buf); -// -// buf is aligned to the max of {alignof(Arg1), ...}. Furthermore, each of the -// args is itself aligned to its preferred alignment. -// -// Note that by the time this function runs, E's args have already undergone the -// standard C vararg promotion (short -> int, float -> double, etc.). -RValue -CodeGenFunction::EmitCUDADevicePrintfCallExpr(const CallExpr *E, - ReturnValueSlot ReturnValue) { - assert(getLangOpts().CUDA); - assert(getLangOpts().CUDAIsDevice); - assert(E->getBuiltinCallee() == Builtin::BIprintf); - assert(E->getNumArgs() >= 1); // printf always has at least one arg. - - const llvm::DataLayout &DL = CGM.getDataLayout(); - llvm::LLVMContext &Ctx = CGM.getLLVMContext(); - - CallArgList Args; - EmitCallArgs(Args, - E->getDirectCallee()->getType()->getAs(), - E->arguments(), E->getDirectCallee(), - /* ParamsToSkip = */ 0); - - // We don't know how to emit non-scalar varargs. - if (std::any_of(Args.begin() + 1, Args.end(), - [](const CallArg &A) { return !A.RV.isScalar(); })) { - CGM.ErrorUnsupported(E, "non-scalar arg to printf"); - return RValue::get(llvm::ConstantInt::get(IntTy, 0)); - } - - // Construct and fill the args buffer that we'll pass to vprintf. - llvm::Value *BufferPtr; - if (Args.size() <= 1) { - // If there are no args, pass a null pointer to vprintf. - BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); - } else { - llvm::SmallVector ArgTypes; - for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) - ArgTypes.push_back(Args[I].RV.getScalarVal()->getType()); - - // Using llvm::StructType is correct only because printf doesn't accept - // aggregates. If we had to handle aggregates here, we'd have to manually - // compute the offsets within the alloca -- we wouldn't be able to assume - // that the alignment of the llvm type was the same as the alignment of the - // clang type. - llvm::Type *AllocaTy = llvm::StructType::create(ArgTypes, "printf_args"); - llvm::Value *Alloca = CreateTempAlloca(AllocaTy); - - for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) { - llvm::Value *P = Builder.CreateStructGEP(AllocaTy, Alloca, I - 1); - llvm::Value *Arg = Args[I].RV.getScalarVal(); - Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlignment(Arg->getType())); - } - BufferPtr = Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx)); - } - - // Invoke vprintf and return. - llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule()); - return RValue::get( - Builder.CreateCall(VprintfFunc, {Args[0].RV.getScalarVal(), BufferPtr})); -} Index: cfe/trunk/lib/CodeGen/CGGPUBuiltin.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGGPUBuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGGPUBuiltin.cpp @@ -0,0 +1,122 @@ +//===------ CGGPUBuiltin.cpp - Codegen for GPU builtins -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Generates code for built-in GPU calls which are not runtime-specific. +// (Runtime-specific codegen lives in programming model specific files.) +// +//===----------------------------------------------------------------------===// + +#include "CodeGenFunction.h" +#include "clang/Basic/Builtins.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instruction.h" +#include "llvm/Support/MathExtras.h" + +using namespace clang; +using namespace CodeGen; + +static llvm::Function *GetVprintfDeclaration(llvm::Module &M) { + llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()), + llvm::Type::getInt8PtrTy(M.getContext())}; + llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get( + llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false); + + if (auto* F = M.getFunction("vprintf")) { + // Our CUDA system header declares vprintf with the right signature, so + // nobody else should have been able to declare vprintf with a bogus + // signature. + assert(F->getFunctionType() == VprintfFuncType); + return F; + } + + // vprintf doesn't already exist; create a declaration and insert it into the + // module. + return llvm::Function::Create( + VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M); +} + +// Transforms a call to printf into a call to the NVPTX vprintf syscall (which +// isn't particularly special; it's invoked just like a regular function). +// vprintf takes two args: A format string, and a pointer to a buffer containing +// the varargs. +// +// For example, the call +// +// printf("format string", arg1, arg2, arg3); +// +// is converted into something resembling +// +// struct Tmp { +// Arg1 a1; +// Arg2 a2; +// Arg3 a3; +// }; +// char* buf = alloca(sizeof(Tmp)); +// *(Tmp*)buf = {a1, a2, a3}; +// vprintf("format string", buf); +// +// buf is aligned to the max of {alignof(Arg1), ...}. Furthermore, each of the +// args is itself aligned to its preferred alignment. +// +// Note that by the time this function runs, E's args have already undergone the +// standard C vararg promotion (short -> int, float -> double, etc.). +RValue +CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E, + ReturnValueSlot ReturnValue) { + assert(getTarget().getTriple().isNVPTX()); + assert(E->getBuiltinCallee() == Builtin::BIprintf); + assert(E->getNumArgs() >= 1); // printf always has at least one arg. + + const llvm::DataLayout &DL = CGM.getDataLayout(); + llvm::LLVMContext &Ctx = CGM.getLLVMContext(); + + CallArgList Args; + EmitCallArgs(Args, + E->getDirectCallee()->getType()->getAs(), + E->arguments(), E->getDirectCallee(), + /* ParamsToSkip = */ 0); + + // We don't know how to emit non-scalar varargs. + if (std::any_of(Args.begin() + 1, Args.end(), + [](const CallArg &A) { return !A.RV.isScalar(); })) { + CGM.ErrorUnsupported(E, "non-scalar arg to printf"); + return RValue::get(llvm::ConstantInt::get(IntTy, 0)); + } + + // Construct and fill the args buffer that we'll pass to vprintf. + llvm::Value *BufferPtr; + if (Args.size() <= 1) { + // If there are no args, pass a null pointer to vprintf. + BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); + } else { + llvm::SmallVector ArgTypes; + for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) + ArgTypes.push_back(Args[I].RV.getScalarVal()->getType()); + + // Using llvm::StructType is correct only because printf doesn't accept + // aggregates. If we had to handle aggregates here, we'd have to manually + // compute the offsets within the alloca -- we wouldn't be able to assume + // that the alignment of the llvm type was the same as the alignment of the + // clang type. + llvm::Type *AllocaTy = llvm::StructType::create(ArgTypes, "printf_args"); + llvm::Value *Alloca = CreateTempAlloca(AllocaTy); + + for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) { + llvm::Value *P = Builder.CreateStructGEP(AllocaTy, Alloca, I - 1); + llvm::Value *Arg = Args[I].RV.getScalarVal(); + Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlignment(Arg->getType())); + } + BufferPtr = Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx)); + } + + // Invoke vprintf and return. + llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule()); + return RValue::get( + Builder.CreateCall(VprintfFunc, {Args[0].RV.getScalarVal(), BufferPtr})); +} Index: cfe/trunk/lib/CodeGen/CMakeLists.txt =================================================================== --- cfe/trunk/lib/CodeGen/CMakeLists.txt +++ cfe/trunk/lib/CodeGen/CMakeLists.txt @@ -36,7 +36,6 @@ CGAtomic.cpp CGBlocks.cpp CGBuiltin.cpp - CGCUDABuiltin.cpp CGCUDANV.cpp CGCUDARuntime.cpp CGCXX.cpp @@ -55,6 +54,7 @@ CGExprComplex.cpp CGExprConstant.cpp CGExprScalar.cpp + CGGPUBuiltin.cpp CGLoopInfo.cpp CGObjC.cpp CGObjCGNU.cpp Index: cfe/trunk/lib/CodeGen/CodeGenFunction.h =================================================================== --- cfe/trunk/lib/CodeGen/CodeGenFunction.h +++ cfe/trunk/lib/CodeGen/CodeGenFunction.h @@ -3106,8 +3106,8 @@ RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E, ReturnValueSlot ReturnValue); - RValue EmitCUDADevicePrintfCallExpr(const CallExpr *E, - ReturnValueSlot ReturnValue); + RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E, + ReturnValueSlot ReturnValue); RValue EmitBuiltinExpr(const FunctionDecl *FD, unsigned BuiltinID, const CallExpr *E, Index: cfe/trunk/test/OpenMP/nvptx_target_printf_codegen.c =================================================================== --- cfe/trunk/test/OpenMP/nvptx_target_printf_codegen.c +++ cfe/trunk/test/OpenMP/nvptx_target_printf_codegen.c @@ -0,0 +1,116 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +#include + +// expected-no-diagnostics +extern int printf(const char *, ...); +extern int vprintf(const char *, va_list); + +// Check a simple call to printf end-to-end. +// CHECK: [[SIMPLE_PRINTF_TY:%[a-zA-Z0-9_]+]] = type { i32, i64, double } +int CheckSimple() { + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+CheckSimple.+]]_worker() +#pragma omp target + { + // Entry point. + // CHECK: define {{.*}}void [[T1]]() + // Alloca in entry block. + // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca [[SIMPLE_PRINTF_TY]] + + // CHECK: {{call|invoke}} void [[T1]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] + // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + + // printf in master-only basic block. + // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt + const char* fmt = "%d %lld %f"; + // CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 0 + // CHECK: store i32 1, i32* [[PTR0]], align 4 + // CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 1 + // CHECK: store i64 2, i64* [[PTR1]], align 8 + // CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 2 + + // CHECK: store double 3.0{{[^,]*}}, double* [[PTR2]], align 8 + // CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]* [[BUF]] to i8* + // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF_CAST]]) + printf(fmt, 1, 2ll, 3.0); + } + + return 0; +} + +void CheckNoArgs() { + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+CheckNoArgs.+]]_worker() +#pragma omp target + { + // Entry point. + // CHECK: define {{.*}}void [[T2]]() + + // CHECK: {{call|invoke}} void [[T2]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] + // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + + // printf in master-only basic block. + // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}} + printf("hello, world!"); + } +} + +// Check that printf's alloca happens in the entry block, not inside the if +// statement. +int foo; +void CheckAllocaIsInEntryBlock() { + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+CheckAllocaIsInEntryBlock.+]]_worker() +#pragma omp target + { + // Entry point. + // CHECK: define {{.*}}void [[T3]]( + // Alloca in entry block. + // CHECK: alloca %printf_args + + // CHECK: {{call|invoke}} void [[T3]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] + // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + + if (foo) { + printf("%d", 42); + } + } +}