diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4115,6 +4115,9 @@ case Builtin::BIprintf: if (getTarget().getTriple().isNVPTX()) return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue); + if (getTarget().getTriple().getArch() == Triple::amdgcn && + getLangOpts().HIP) + return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue); break; case Builtin::BI__builtin_canonicalize: case Builtin::BI__builtin_canonicalizef: diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp --- a/clang/lib/CodeGen/CGGPUBuiltin.cpp +++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp @@ -16,6 +16,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instruction.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h" using namespace clang; using namespace CodeGen; @@ -120,3 +121,36 @@ return RValue::get(Builder.CreateCall( VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr})); } + +RValue +CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E, + ReturnValueSlot ReturnValue) { + assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn); + assert(E->getBuiltinCallee() == Builtin::BIprintf || + E->getBuiltinCallee() == Builtin::BI__builtin_printf); + assert(E->getNumArgs() >= 1); // printf always has at least one arg. + + CallArgList CallArgs; + EmitCallArgs(CallArgs, + E->getDirectCallee()->getType()->getAs(), + E->arguments(), E->getDirectCallee(), + /* ParamsToSkip = */ 0); + + SmallVector Args; + for (auto A : CallArgs) { + // We don't know how to emit non-scalar varargs. + if (!A.getRValue(*this).isScalar()) { + CGM.ErrorUnsupported(E, "non-scalar arg to printf"); + return RValue::get(llvm::ConstantInt::get(IntTy, -1)); + } + + llvm::Value *Arg = A.getRValue(*this).getScalarVal(); + Args.push_back(Arg); + } + + llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint()); + IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation()); + auto Printf = llvm::emitAMDGPUPrintfCall(IRB, Args); + Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint()); + return RValue::get(Printf); +} diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3722,6 +3722,8 @@ RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E, ReturnValueSlot ReturnValue); + RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E, + ReturnValueSlot ReturnValue); RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue); diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -436,6 +436,7 @@ void HIPToolChain::addClangWarningOptions(ArgStringList &CC1Args) const { HostTC.addClangWarningOptions(CC1Args); + CC1Args.push_back("-Werror=format-nonliteral"); } ToolChain::CXXStdlibType diff --git a/clang/test/CodeGenHIP/printf-aggregate.cpp b/clang/test/CodeGenHIP/printf-aggregate.cpp new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenHIP/printf-aggregate.cpp @@ -0,0 +1,20 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fcuda-is-device \ +// RUN: -verify -emit-llvm %s + +#define __device__ __attribute__((device)) +extern "C" __device__ int printf(const char *format, ...); + +// Check that we don't crash when asked to printf a non-scalar arg. +struct Struct { + int x; + int y; +}; + +__device__ void PrintfNonScalar(const char *fmt) { + printf(fmt, 1); + // Ignore the warning about the %d not matching the struct argument + // expected-warning@+2 {{}} + // expected-error@+1 {{cannot compile this non-scalar arg to printf}} + printf("%d", Struct()); +} diff --git a/clang/test/CodeGenHIP/printf.cpp b/clang/test/CodeGenHIP/printf.cpp new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenHIP/printf.cpp @@ -0,0 +1,44 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device \ +// RUN: -o - %s | FileCheck --enable-var-scope %s + +#define __device__ __attribute__((device)) + +extern "C" __device__ int printf(const char *format, ...); + +__device__ int foo1() { + const char *s = "hello world"; + return printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 4, s, s); +} + +// CHECK-LABEL: @_Z4foo1v() +// CHECK: [[BEGIN:%.*]] = call i64 @__ockl_printf_begin(i64 0) +// CHECK: [[STRLEN1:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ] +// CHECK: [[APPEND1:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[BEGIN]], {{.*}}, i64 [[STRLEN1]], i32 0) +// CHECK: [[APPEND2:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND1]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// CHECK: [[APPEND3:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND2]], i32 1, i64 4614256650576692846, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// CHECK: [[APPEND4:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND3]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// CHECK: [[APPEND5:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND4]], i32 1, i64 4, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// CHECK: [[STRLEN2:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ] +// CHECK: [[APPEND6:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[APPEND5]], {{.*}}, i64 [[STRLEN2]], i32 0) +// CHECK: [[PTR2INT:%.*]] = ptrtoint i8* %{{.*}} to i64 +// CHECK: [[APPEND7:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND6]], i32 1, i64 [[PTR2INT]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) +// CHECK: [[RETURN:%.*]] = trunc i64 [[APPEND7]] to i32 +// CHECK: ret i32 [[RETURN]] + +__device__ char *dstr; + +__device__ int foo2() { + return printf("%s %p\n", dstr, dstr); +} + +// CHECK-LABEL: @_Z4foo2v() +// CHECK: [[BEGIN:%.*]] = call i64 @__ockl_printf_begin(i64 0) +// CHECK: [[STRLEN1:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ] +// CHECK: [[APPEND1:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[BEGIN]], {{.*}}, i64 [[STRLEN1]], i32 0) +// CHECK: [[STRLEN2:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ] +// CHECK: [[APPEND2:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[APPEND1]], {{.*}}, i64 [[STRLEN2]], i32 0) +// CHECK: [[PTR2INT:%.*]] = ptrtoint i8* %{{.*}} to i64 +// CHECK: [[APPEND3:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND2]], i32 1, i64 [[PTR2INT]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) +// CHECK: [[RETURN:%.*]] = trunc i64 [[APPEND3]] to i32 +// CHECK: ret i32 [[RETURN]] diff --git a/clang/test/Driver/hip-printf.hip b/clang/test/Driver/hip-printf.hip new file mode 100644 --- /dev/null +++ b/clang/test/Driver/hip-printf.hip @@ -0,0 +1,9 @@ +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: amdgpu-registered-target + +// RUN: %clang -### -target x86_64-linux-gnu -x hip --cuda-gpu-arch=gfx900 \ +// RUN: %s 2>&1 | FileCheck %s + +// CHECK: [[CLANG:".*clang.*"]] "-cc1" +// CHECK-SAME: "-Werror=format-nonliteral" diff --git a/llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h b/llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h @@ -0,0 +1,25 @@ +//===- AMDGPUEmitPrintf.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Utility function to lower a printf call into a series of device +// library calls on the AMDGPU target. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_AMDGPUEMITPRINTF_H +#define LLVM_TRANSFORMS_UTILS_AMDGPUEMITPRINTF_H + +#include "llvm/IR/IRBuilder.h" + +namespace llvm { + +Value *emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef Args); + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_AMDGPUEMITPRINTF_H diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp @@ -0,0 +1,246 @@ +//===- AMDGPUEmitPrintf.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Utility function to lower a printf call into a series of device +// library calls on the AMDGPU target. +// +// WARNING: This file knows about certain library functions. It recognizes them +// by name, and hardwires knowledge of their semantics. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" + +#include + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-emit-printf" + +static bool isCString(const Value *Arg) { + auto Ty = Arg->getType(); + auto PtrTy = dyn_cast(Ty); + if (!PtrTy) + return false; + + auto IntTy = dyn_cast(PtrTy->getElementType()); + if (!IntTy) + return false; + + return IntTy->getBitWidth() == 8; +} + +static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) { + auto Int64Ty = Builder.getInt64Ty(); + auto Ty = Arg->getType(); + + if (auto IntTy = dyn_cast(Ty)) { + switch (IntTy->getBitWidth()) { + case 32: + return Builder.CreateZExt(Arg, Int64Ty); + case 64: + return Arg; + } + } + + if (Ty->getTypeID() == Type::DoubleTyID) { + return Builder.CreateBitCast(Arg, Int64Ty); + } + + if (auto PtrTy = dyn_cast(Ty)) { + return Builder.CreatePtrToInt(Arg, Int64Ty); + } + + llvm_unreachable("unexpected type"); +} + +static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) { + auto Int64Ty = Builder.getInt64Ty(); + auto M = Builder.GetInsertBlock()->getModule(); + auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty); + return Builder.CreateCall(Fn, Version); +} + +static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs, + Value *Arg0, Value *Arg1, Value *Arg2, Value *Arg3, + Value *Arg4, Value *Arg5, Value *Arg6, + bool IsLast) { + auto Int64Ty = Builder.getInt64Ty(); + auto Int32Ty = Builder.getInt32Ty(); + auto M = Builder.GetInsertBlock()->getModule(); + auto Fn = M->getOrInsertFunction("__ockl_printf_append_args", Int64Ty, + Int64Ty, Int32Ty, Int64Ty, Int64Ty, Int64Ty, + Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty); + auto IsLastValue = Builder.getInt32(IsLast); + auto NumArgsValue = Builder.getInt32(NumArgs); + return Builder.CreateCall(Fn, {Desc, NumArgsValue, Arg0, Arg1, Arg2, Arg3, + Arg4, Arg5, Arg6, IsLastValue}); +} + +static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, + bool IsLast) { + auto Arg0 = fitArgInto64Bits(Builder, Arg); + auto Zero = Builder.getInt64(0); + return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero, + Zero, IsLast); +} + +// The device library does not provide strlen, so we build our own loop +// here. While we are at it, we also include the terminating null in the length. +static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) { + auto *Prev = Builder.GetInsertBlock(); + Module *M = Prev->getModule(); + + auto CharZero = Builder.getInt8(0); + auto One = Builder.getInt64(1); + auto Zero = Builder.getInt64(0); + auto Int64Ty = Builder.getInt64Ty(); + + // The length is either zero for a null pointer, or the computed value for an + // actual string. We need a join block for a phi that represents the final + // value. + // + // Strictly speaking, the zero does not matter since + // __ockl_printf_append_string_n ignores the length if the pointer is null. + BasicBlock *Join = nullptr; + if (Prev->getTerminator()) { + Join = Prev->splitBasicBlock(Builder.GetInsertPoint(), + "strlen.join"); + Prev->getTerminator()->eraseFromParent(); + } else { + Join = BasicBlock::Create(M->getContext(), "strlen.join", + Prev->getParent()); + } + BasicBlock *While = + BasicBlock::Create(M->getContext(), "strlen.while", + Prev->getParent(), Join); + BasicBlock *WhileDone = BasicBlock::Create( + M->getContext(), "strlen.while.done", + Prev->getParent(), Join); + + // Emit an early return for when the pointer is null. + Builder.SetInsertPoint(Prev); + auto CmpNull = + Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType())); + BranchInst::Create(Join, While, CmpNull, Prev); + + // Entry to the while loop. + Builder.SetInsertPoint(While); + + auto PtrPhi = Builder.CreatePHI(Str->getType(), 2); + PtrPhi->addIncoming(Str, Prev); + auto PtrNext = Builder.CreateGEP(PtrPhi, One); + PtrPhi->addIncoming(PtrNext, While); + + // Condition for the while loop. + auto Data = Builder.CreateLoad(PtrPhi); + auto Cmp = Builder.CreateICmpEQ(Data, CharZero); + Builder.CreateCondBr(Cmp, WhileDone, While); + + // Add one to the computed length. + Builder.SetInsertPoint(WhileDone, WhileDone->begin()); + auto Begin = Builder.CreatePtrToInt(Str, Int64Ty); + auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty); + auto Len = Builder.CreateSub(End, Begin); + Len = Builder.CreateAdd(Len, One); + + // Final join. + BranchInst::Create(Join, WhileDone); + Builder.SetInsertPoint(Join, Join->begin()); + auto LenPhi = Builder.CreatePHI(Len->getType(), 2); + LenPhi->addIncoming(Len, WhileDone); + LenPhi->addIncoming(Zero, Prev); + + return LenPhi; +} + +static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str, + Value *Length, bool isLast) { + auto Int64Ty = Builder.getInt64Ty(); + auto CharPtrTy = Builder.getInt8PtrTy(); + auto Int32Ty = Builder.getInt32Ty(); + auto M = Builder.GetInsertBlock()->getModule(); + auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty, + Int64Ty, CharPtrTy, Int64Ty, Int32Ty); + auto IsLastInt32 = Builder.getInt32(isLast); + return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32}); +} + +static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg, + bool IsLast) { + auto Length = getStrlenWithNull(Builder, Arg); + return callAppendStringN(Builder, Desc, Arg, Length, IsLast); +} + +static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, + bool SpecIsCString, bool IsLast) { + if (SpecIsCString && isCString(Arg)) { + return appendString(Builder, Desc, Arg, IsLast); + } + // If the format specifies a string but the argument is not, the frontend will + // have printed a warning. We just rely on undefined behaviour and send the + // argument anyway. + return appendArg(Builder, Desc, Arg, IsLast); +} + +// Scan the format string to locate all specifiers, and mark the ones that +// specify a string, i.e, the "%s" specifier with optional '*' characters. +static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) { + StringRef Str; + if (!getConstantStringInfo(Fmt, Str) || Str.empty()) + return; + + static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn"; + size_t SpecPos = 0; + // Skip the first argument, the format string. + unsigned ArgIdx = 1; + + while ((SpecPos = Str.find_first_of('%', SpecPos)) != StringRef::npos) { + if (Str[SpecPos + 1] == '%') { + SpecPos += 2; + continue; + } + auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos); + if (SpecEnd == StringRef::npos) + return; + auto Spec = Str.slice(SpecPos, SpecEnd + 1); + ArgIdx += Spec.count('*'); + if (Str[SpecEnd] == 's') { + BV.set(ArgIdx); + } + SpecPos = SpecEnd + 1; + ++ArgIdx; + } +} + +Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, + ArrayRef Args) { + auto NumOps = Args.size(); + assert(NumOps >= 1); + + auto Fmt = Args[0]; + SparseBitVector<8> SpecIsCString; + locateCStrings(SpecIsCString, Fmt); + + auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0)); + Desc = appendString(Builder, Desc, Fmt, NumOps == 1); + + // FIXME: This invokes hostcall once for each argument. We can pack up to + // seven scalar printf arguments in a single hostcall. See the signature of + // callAppendArgs(). + for (unsigned int i = 1; i != NumOps; ++i) { + bool IsLast = i == NumOps - 1; + bool IsCString = SpecIsCString.test(i); + Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast); + } + + return Builder.CreateTrunc(Desc, Builder.getInt32Ty()); +} diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_component_library(LLVMTransformUtils + AMDGPUEmitPrintf.cpp ASanStackFrameLayout.cpp AddDiscriminators.cpp BasicBlockUtils.cpp