diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -588,6 +588,12 @@ --undefined`` if using an offloading language. - The deprecated ``-mcode-object-v3`` and ``-mno-code-object-v3`` command-line options have been removed. +- A new option ``-mprintf-kind`` has been introduced that controls printf lowering + scheme. It is currently supported only for HIP and takes following values, + ``hostcall`` - printing happens during kernel execution via series of hostcalls, + The scheme requires the system to support pcie atomics.(default) + ``buffered`` - Scheme uses a debug buffer to populate printf varargs, does not + rely on pcie atomics support. X86 Support ^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -90,6 +90,19 @@ /// \brief Code object version for AMDGPU. CodeObjectVersionKind CodeObjectVersion = CodeObjectVersionKind::COV_None; + /// \brief Enumeration values for AMDGPU printf lowering scheme + enum class AMDGPUPrintfKind { + /// printf lowering scheme involving hostcalls, currently used by HIP + /// programs by default + Hostcall = 0, + + /// printf lowering scheme involving implicit printf buffers, + Buffered = 1, + }; + + /// \brief AMDGPU Printf lowering scheme + AMDGPUPrintfKind AMDGPUPrintfKindVal = AMDGPUPrintfKind::Hostcall; + // The code model to be used as specified by the user. Corresponds to // CodeModel::Model enum defined in include/llvm/Support/CodeGen.h, plus // "default" for the case when the user has not explicitly specified a diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1033,6 +1033,17 @@ TargetOpts<"NVPTXUseShortPointers">, DefaultFalse, PosFlag, NegFlag>; +def mprintf_kind_EQ : Joined<["-"], "mprintf-kind=">, Group, + HelpText<"Specify the printf lowering scheme (AMDGPU only), allowed values are " + "\"hostcall\"(printing happens during kernel execution, this scheme " + "relies on hostcalls which require system to support pcie atomics) " + "and \"buffered\"(printing happens after all kernel threads exit, " + "this uses a printf buffer and does not rely on pcie atomic support)">, + Flags<[CC1Option]>, + Values<"hostcall,buffered">, + NormalizedValuesScope<"TargetOptions::AMDGPUPrintfKind">, + NormalizedValues<["Hostcall", "Buffered"]>, + MarshallingInfoEnum, "Hostcall">; def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">, HelpText<"Specify default stream. The default value is 'legacy'. (HIP only)">, Flags<[CC1Option]>, diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp --- a/clang/lib/CodeGen/CGGPUBuiltin.cpp +++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp @@ -202,7 +202,10 @@ llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint()); IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation()); - auto Printf = llvm::emitAMDGPUPrintfCall(IRB, Args); + + bool isBuffered = (CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal == + clang::TargetOptions::AMDGPUPrintfKind::Buffered); + auto Printf = llvm::emitAMDGPUPrintfCall(IRB, Args, isBuffered); Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint()); return RValue::get(Printf); } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -611,6 +611,17 @@ "amdgpu_code_object_version", getTarget().getTargetOpts().CodeObjectVersion); } + + // Currently, "-mprintf-kind" option is only supported for HIP + if (LangOpts.HIP) { + auto *MDStr = llvm::MDString::get( + getLLVMContext(), (getTarget().getTargetOpts().AMDGPUPrintfKindVal == + TargetOptions::AMDGPUPrintfKind::Hostcall) + ? "hostcall" + : "buffered"); + getModule().addModuleFlag(llvm::Module::Error, "amdgpu_printf_kind", + MDStr); + } } // Emit a global array containing all external kernels or device variables diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4694,8 +4694,25 @@ } CmdArgs.push_back("-aux-triple"); CmdArgs.push_back(Args.MakeArgString(NormalizedTriple)); + + if (JA.isDeviceOffloading(Action::OFK_HIP) && + getToolChain().getTriple().isAMDGPU()) { + // Device side compilation printf + if (Args.getLastArg(options::OPT_mprintf_kind_EQ)) { + CmdArgs.push_back(Args.MakeArgString( + "-mprintf-kind=" + + Args.getLastArgValue(options::OPT_mprintf_kind_EQ))); + // Force compiler error on invalid conversion specifiers + CmdArgs.push_back( + Args.MakeArgString("-Werror=format-invalid-specifier")); + } + } } + // Unconditionally claim the printf option now to avoid unused diagnostic. + if (const Arg *PF = Args.getLastArg(options::OPT_mprintf_kind_EQ)) + PF->claim(); + if (Args.hasFlag(options::OPT_fsycl, options::OPT_fno_sycl, false)) { CmdArgs.push_back("-fsycl-is-device"); diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip --- a/clang/test/CodeGenHIP/default-attributes.hip +++ b/clang/test/CodeGenHIP/default-attributes.hip @@ -47,8 +47,10 @@ // OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } //. // OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} -// OPTNONE: !1 = !{i32 1, !"wchar_size", i32 4} +// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4} //. // OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} -// OPT: !1 = !{i32 1, !"wchar_size", i32 4} +// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPT: !2 = !{i32 1, !"wchar_size", i32 4} //. diff --git a/clang/test/CodeGenHIP/printf-kind-module-flag.hip b/clang/test/CodeGenHIP/printf-kind-module-flag.hip new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenHIP/printf-kind-module-flag.hip @@ -0,0 +1,17 @@ +// Create module flag for printf kind. + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -o - %s | FileCheck %s -check-prefix=HOSTCALL + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -mprintf-kind=hostcall -o - %s | FileCheck %s -check-prefix=HOSTCALL + +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -mprintf-kind=buffered -o - %s | FileCheck -check-prefix=BUFFERED %s + +// RUN: not %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -mprintf-kind=none -o - %s 2>&1| FileCheck %s -check-prefix=INV + +// HOSTCALL: !{{.*}} = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// BUFFERED: !{{.*}} = !{i32 1, !"amdgpu_printf_kind", !"buffered"} +// INV: error: invalid value 'none' in '-mprintf-kind=none' diff --git a/clang/test/CodeGenHIP/printf_nonhostcall.cpp b/clang/test/CodeGenHIP/printf_nonhostcall.cpp new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenHIP/printf_nonhostcall.cpp @@ -0,0 +1,259 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -mprintf-kind=buffered -fcuda-is-device \ +// RUN: -o - %s | FileCheck --enable-var-scope %s + +#define __device__ __attribute__((device)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) + +extern "C" __device__ int printf(const char *format, ...); + +// CHECK-LABEL: define dso_local noundef i32 @_Z4foo1v +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr +// CHECK-NEXT: store ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr [[S_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-NEXT: br i1 [[TMP2]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]] +// CHECK: strlen.while: +// CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ] +// CHECK-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0 +// CHECK-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]] +// CHECK: strlen.while.done: +// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP3]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[TMP7]] +// CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1 +// CHECK-NEXT: br label [[STRLEN_JOIN]] +// CHECK: strlen.join: +// CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ] +// CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 7 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4294967288 +// CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 52 +// CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +// CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP15]]) +// CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +// CHECK-NEXT: br i1 [[TMP16]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]] +// CHECK: end.block: +// CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true +// CHECK-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP17]] to i32 +// CHECK-NEXT: ret i32 [[PRINTF_RESULT]] +// CHECK: argpush.block: +// CHECK-NEXT: [[TMP18:%.*]] = shl i32 [[TMP15]], 2 +// CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], 2 +// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +// CHECK-NEXT: store i64 1107004088646384690, ptr addrspace(1) [[TMP20]], align 8 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i32 8 +// CHECK-NEXT: store i64 8, ptr addrspace(1) [[TMP21]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 8 +// CHECK-NEXT: store double 3.141590e+00, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +// CHECK-NEXT: store i64 8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8 +// CHECK-NEXT: store i64 4, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8 +// CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[PRINTBUFFNEXTPTR3]], ptr align 1 [[TMP0]], i64 [[TMP11]], i1 false) +// CHECK-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i64 [[TMP13]] +// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 8 +// CHECK-NEXT: br label [[END_BLOCK]] +// +__device__ int foo1() { + const char *s = "hello world"; + return printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 4, s, s); +} + +__device__ char *dstr; +__device__ const +// CHECK-LABEL: define dso_local noundef i32 @_Z4foo2v +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[LCVAL:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[LCVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LCVAL]] to ptr +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @dstr to ptr), align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @dstr to ptr), align 8 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-NEXT: br i1 [[TMP2]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]] +// CHECK: strlen.while: +// CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ] +// CHECK-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0 +// CHECK-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]] +// CHECK: strlen.while.done: +// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP3]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[TMP7]] +// CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1 +// CHECK-NEXT: br label [[STRLEN_JOIN]] +// CHECK: strlen.join: +// CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ] +// CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 7 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4294967288 +// CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 36 +// CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +// CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP15]]) +// CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +// CHECK-NEXT: br i1 [[TMP16]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]] +// CHECK: end.block: +// CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true +// CHECK-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP17]] to i32 +// CHECK-NEXT: ret i32 [[PRINTF_RESULT]] +// CHECK: argpush.block: +// CHECK-NEXT: [[TMP18:%.*]] = shl i32 [[TMP15]], 2 +// CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], 2 +// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +// CHECK-NEXT: store i64 7257695813269076350, ptr addrspace(1) [[TMP20]], align 8 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i32 8 +// CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[TMP21]], ptr align 1 [[TMP0]], i64 [[TMP11]], i1 false) +// CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i64 [[TMP13]] +// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +// CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @_ZZ4foo2vE5shval to ptr), ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8 +// CHECK-NEXT: store ptr [[LCVAL_ASCAST]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8 +// CHECK-NEXT: br label [[END_BLOCK]] +// +__device__ int foo2() { + __shared__ int shval; + int lcval; + return printf("%s %p %p %p\n", dstr, dstr, &shval, &lcval); +} + +__device__ unsigned short g = 30; +__device__ unsigned long n = 30; + +__device__ float f1 = 3.14f; +__device__ double f2 = 2.71828; +__device__ _Float16 f3 = 2.71; +__device__ __bf16 f4 = 3.142; +__device__ _BitInt(55) Int55 = 31; +__device__ _BitInt(44) Int44 = 312; +__device__ _BitInt(128) Int128 = 45637; + +// CHECK-LABEL: define dso_local noundef i32 @_Z4foo3v +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: store i32 25, ptr addrspacecast (ptr addrspace(3) @_ZZ4foo3vE1s to ptr), align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ4foo3vE1s to ptr), align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspacecast (ptr addrspace(1) @g to ptr), align 2 +// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @n to ptr), align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @f1 to ptr), align 4 +// CHECK-NEXT: [[CONV1:%.*]] = fpext float [[TMP3]] to double +// CHECK-NEXT: [[TMP4:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @f2 to ptr), align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr addrspacecast (ptr addrspace(1) @f3 to ptr), align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load bfloat, ptr addrspacecast (ptr addrspace(1) @f4 to ptr), align 2 +// CHECK-NEXT: [[TMP7:%.*]] = load i55, ptr addrspacecast (ptr addrspace(1) @Int55 to ptr), align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i44, ptr addrspacecast (ptr addrspace(1) @Int44 to ptr), align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load i128, ptr addrspacecast (ptr addrspace(1) @Int128 to ptr), align 8 +// CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 108) +// CHECK-NEXT: [[TMP10:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +// CHECK-NEXT: br i1 [[TMP10]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]] +// CHECK: end.block: +// CHECK-NEXT: [[TMP11:%.*]] = xor i1 [[TMP10]], true +// CHECK-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP11]] to i32 +// CHECK-NEXT: ret i32 [[PRINTF_RESULT]] +// CHECK: argpush.block: +// CHECK-NEXT: store i32 434, ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +// CHECK-NEXT: store i64 7271852820361268873, ptr addrspace(1) [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP12]], i32 8 +// CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: store i64 [[TMP14]], ptr addrspace(1) [[TMP13]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i32 8 +// CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @_ZZ4foo3vE1s to ptr), ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +// CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[CONV]] to i64 +// CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8 +// CHECK-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 8 +// CHECK-NEXT: store double [[CONV1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 8 +// CHECK-NEXT: store double [[TMP4]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8 +// CHECK-NEXT: [[TMP16:%.*]] = fpext half [[TMP5]] to double +// CHECK-NEXT: store double [[TMP16]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8 +// CHECK-NEXT: [[TMP17:%.*]] = fpext bfloat [[TMP6]] to double +// CHECK-NEXT: store double [[TMP17]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8 +// CHECK-NEXT: [[TMP18:%.*]] = zext i55 [[TMP7]] to i64 +// CHECK-NEXT: store i64 [[TMP18]], ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], i32 8 +// CHECK-NEXT: [[TMP19:%.*]] = zext i44 [[TMP8]] to i64 +// CHECK-NEXT: store i64 [[TMP19]], ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], i32 8 +// CHECK-NEXT: store i128 [[TMP9]], ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], i32 16 +// CHECK-NEXT: br label [[END_BLOCK]] +// +__device__ int foo3() { + __shared__ int s; + s = 25; + return printf("Random values: %d,%p,%hd,%ld,%f,%f,%f,%f,%d,%d,%d\n",s, &s, g, n, f1, f2, f3, f4, Int55, Int44, Int128); +} + +//A non trivial case, +// CHECK-LABEL: define dso_local noundef i32 @_Z4foo4v +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr +// CHECK-NEXT: store ptr addrspacecast (ptr addrspace(4) @.str.4 to ptr), ptr [[S_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-NEXT: br i1 [[TMP1]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]] +// CHECK: strlen.while: +// CHECK-NEXT: [[TMP2:%.*]] = phi ptr [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[STRLEN_WHILE]] ] +// CHECK-NEXT: [[TMP3]] = getelementptr i8, ptr [[TMP2]], i64 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 +// CHECK-NEXT: br i1 [[TMP5]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]] +// CHECK: strlen.while.done: +// CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP2]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], [[TMP6]] +// CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1 +// CHECK-NEXT: br label [[STRLEN_JOIN]] +// CHECK: strlen.join: +// CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ] +// CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 7 +// CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], 4294967288 +// CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 12 +// CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 +// CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP14]]) +// CHECK-NEXT: [[TMP15:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +// CHECK-NEXT: br i1 [[TMP15]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]] +// CHECK: end.block: +// CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +// CHECK-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP16]] to i32 +// CHECK-NEXT: ret i32 [[PRINTF_RESULT]] +// CHECK: argpush.block: +// CHECK-NEXT: [[TMP17:%.*]] = shl i32 [[TMP14]], 2 +// CHECK-NEXT: store i32 [[TMP17]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +// CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[TMP18]], ptr align 1 [[TMP0]], i64 [[TMP10]], i1 false) +// CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP18]], i64 [[TMP12]] +// CHECK-NEXT: store i64 10, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +// CHECK-NEXT: br label [[END_BLOCK]] +// +__device__ int foo4() { + const char* s = "format str%d"; + return printf(s, 10); +} diff --git a/clang/test/CodeGenHIP/sanitize-undefined-null.hip b/clang/test/CodeGenHIP/sanitize-undefined-null.hip --- a/clang/test/CodeGenHIP/sanitize-undefined-null.hip +++ b/clang/test/CodeGenHIP/sanitize-undefined-null.hip @@ -20,12 +20,12 @@ // CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr // CHECK-NEXT: store ptr [[P:%.*]], ptr [[P_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize !3 -// CHECK-NEXT: br i1 [[TMP1]], label [[CONT:%.*]], label [[HANDLER_TYPE_MISMATCH:%.*]], !prof [[PROF4:![0-9]+]], !nosanitize !3 +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize !4 +// CHECK-NEXT: br i1 [[TMP1]], label [[CONT:%.*]], label [[HANDLER_TYPE_MISMATCH:%.*]], !prof [[PROF4:![0-9]+]], !nosanitize !4 // CHECK: handler.type_mismatch: -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64, !nosanitize !3 -// CHECK-NEXT: call void @__ubsan_handle_type_mismatch_v1_abort(ptr addrspace(1) @[[GLOB1:[0-9]+]], i64 [[TMP2]]) #[[ATTR2:[0-9]+]], !nosanitize !3 -// CHECK-NEXT: unreachable, !nosanitize !3 +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64, !nosanitize !4 +// CHECK-NEXT: call void @__ubsan_handle_type_mismatch_v1_abort(ptr addrspace(1) @[[GLOB1:[0-9]+]], i64 [[TMP2]]) #[[ATTR2:[0-9]+]], !nosanitize !4 +// CHECK-NEXT: unreachable, !nosanitize !4 // CHECK: cont: // CHECK-NEXT: store i8 0, ptr [[TMP0]], align 1 // CHECK-NEXT: ret i32 3 diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip --- a/clang/test/Driver/hip-options.hip +++ b/clang/test/Driver/hip-options.hip @@ -21,6 +21,22 @@ // PTH: "-cc1"{{.*}} "-E" {{.*}}"-fgpu-default-stream=per-thread" // PTH: "-cc1"{{.*}} "-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output" +// Check -mprintf-kind=hostcall +// RUN: %clang -### -mprintf-kind=hostcall %s -save-temps 2>&1 | FileCheck -check-prefix=HOSTC %s +// HOSTC: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-mprintf-kind=hostcall" "-Werror=format-invalid-specifier" "-E" {{.*}} +// HOSTC: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}} "-mprintf-kind=hostcall" "-Werror=format-invalid-specifier" {{.*}}"-x" "hip-cpp-output" +// HOSTC: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}} "-mprintf-kind=hostcall" "-Werror=format-invalid-specifier" {{.*}}"-x" "ir" +// HOSTC: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-E" {{.*}} +// HOSTC: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-x" "ir" + +// Check -mprintf-kind=buffered +// RUN: %clang -### -mprintf-kind=buffered %s -save-temps 2>&1 | FileCheck -check-prefix=BUFF %s +// BUFF: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-mprintf-kind=buffered" "-Werror=format-invalid-specifier" "-E" {{.*}} +// BUFF: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}} "-mprintf-kind=buffered" "-Werror=format-invalid-specifier" {{.*}}"-x" "hip-cpp-output" +// BUFF: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}} "-mprintf-kind=buffered" "-Werror=format-invalid-specifier" {{.*}}"-x" "ir" +// BUFF: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-E" {{.*}} +// BUFF: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-x" "ir" + // RUN: %clang -### -x hip --target=x86_64-pc-windows-msvc -fms-extensions \ // RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \ // RUN: FileCheck -check-prefix=MLLVM %s diff --git a/llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h b/llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h --- a/llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h +++ b/llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h @@ -18,7 +18,8 @@ namespace llvm { -Value *emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef Args); +Value *emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef Args, + bool isBuffered); } // end namespace llvm diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp --- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp +++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp @@ -17,6 +17,9 @@ #include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; @@ -179,11 +182,7 @@ // Scan the format string to locate all specifiers, and mark the ones that // specify a string, i.e, the "%s" specifier with optional '*' characters. -static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) { - StringRef Str; - if (!getConstantStringInfo(Fmt, Str) || Str.empty()) - return; - +static void locateCStrings(SparseBitVector<8> &BV, StringRef Str) { static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn"; size_t SpecPos = 0; // Skip the first argument, the format string. @@ -207,14 +206,322 @@ } } -Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, - ArrayRef Args) { +// helper struct to package the string related data +struct StringData { + StringRef Str; + Value *RealSize = nullptr; + Value *AlignedSize = nullptr; + bool IsConst = true; + + StringData(StringRef ST, Value *RS, Value *AS, bool IC) + : Str(ST), RealSize(RS), AlignedSize(AS), IsConst(IC) {} +}; + +// Calculates frame size required for current printf expansion and allocates +// space on printf buffer. Printf frame includes following contents +// [ ControlDWord , format string/Hash , Arguments (each aligned to 8 byte) ] +static Value *callBufferedPrintfStart( + IRBuilder<> &Builder, ArrayRef Args, Value *Fmt, + bool isConstFmtStr, SparseBitVector<8> &SpecIsCString, + SmallVectorImpl &StringContents, Value *&ArgSize) { + Module *M = Builder.GetInsertBlock()->getModule(); + Value *NonConstStrLen = nullptr; + Value *LenWithNull = nullptr; + Value *LenWithNullAligned = nullptr; + Value *TempAdd = nullptr; + + // First 4 bytes to be reserved for control dword + size_t BufSize = 4; + if (isConstFmtStr) + // First 8 bytes of MD5 hash + BufSize += 8; + else { + LenWithNull = getStrlenWithNull(Builder, Fmt); + + // Align the computed length to next 8 byte boundary + TempAdd = Builder.CreateAdd(LenWithNull, + ConstantInt::get(LenWithNull->getType(), 7U)); + NonConstStrLen = Builder.CreateAnd( + TempAdd, ConstantInt::get(LenWithNull->getType(), ~7U)); + + StringContents.push_back( + StringData(StringRef(), LenWithNull, NonConstStrLen, false)); + } + + for (size_t i = 1; i < Args.size(); i++) { + if (SpecIsCString.test(i)) { + StringRef ArgStr; + if (getConstantStringInfo(Args[i], ArgStr)) { + auto alignedLen = alignTo(ArgStr.size() + 1, 8); + StringContents.push_back(StringData( + ArgStr, + /*RealSize*/ nullptr, /*AlignedSize*/ nullptr, /*IsConst*/ true)); + BufSize += alignedLen; + } else { + LenWithNull = getStrlenWithNull(Builder, Args[i]); + + // Align the computed length to next 8 byte boundary + TempAdd = Builder.CreateAdd( + LenWithNull, ConstantInt::get(LenWithNull->getType(), 7U)); + LenWithNullAligned = Builder.CreateAnd( + TempAdd, ConstantInt::get(LenWithNull->getType(), ~7U)); + + if (NonConstStrLen) { + auto Val = Builder.CreateAdd(LenWithNullAligned, NonConstStrLen, + "cumulativeAdd"); + NonConstStrLen = Val; + } else + NonConstStrLen = LenWithNullAligned; + + StringContents.push_back( + StringData(StringRef(), LenWithNull, LenWithNullAligned, false)); + } + } else { + auto AllocSize = M->getDataLayout().getTypeAllocSize(Args[i]->getType()); + if (AllocSize <= 8) + // We end up expanding non string arguments to 8 bytes + // (args smaller than 8 bytes) + BufSize += 8; + else + BufSize += AllocSize; + } + } + + // calculate final size value to be passed to printf_alloc + Value *SizeToReserve = ConstantInt::get(Builder.getInt64Ty(), BufSize, false); + SmallVector Alloc_args; + if (NonConstStrLen) + SizeToReserve = Builder.CreateAdd(NonConstStrLen, SizeToReserve); + + ArgSize = Builder.CreateTrunc(SizeToReserve, Builder.getInt32Ty()); + Alloc_args.push_back(ArgSize); + + // call the printf_alloc function + AttributeList Attr = AttributeList::get( + Builder.getContext(), AttributeList::FunctionIndex, Attribute::NoUnwind); + + Type *Tys_alloc[1] = {Builder.getInt32Ty()}; + Type *I8Ptr = + Builder.getInt8PtrTy(M->getDataLayout().getDefaultGlobalsAddressSpace()); + FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false); + auto PrintfAllocFn = + M->getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr); + + return Builder.CreateCall(PrintfAllocFn, Alloc_args, "printf_alloc_fn"); +} + +// Prepare constant string argument to push onto the buffer +static void processConstantStringArg(StringData *SD, IRBuilder<> &Builder, + SmallVectorImpl &WhatToStore) { + std::string Str(SD->Str.str() + '\0'); + + DataExtractor Extractor(Str, /*IsLittleEndian=*/true, 8); + DataExtractor::Cursor Offset(0); + while (Offset && Offset.tell() < Str.size()) { + const uint64_t ReadSize = 4; + uint64_t ReadNow = std::min(ReadSize, Str.size() - Offset.tell()); + uint64_t ReadBytes = 0; + switch (ReadNow) { + default: + llvm_unreachable("min(4, X) > 4?"); + case 1: + ReadBytes = Extractor.getU8(Offset); + break; + case 2: + ReadBytes = Extractor.getU16(Offset); + break; + case 3: + ReadBytes = Extractor.getU24(Offset); + break; + case 4: + ReadBytes = Extractor.getU32(Offset); + break; + } + cantFail(Offset.takeError(), "failed to read bytes from constant array"); + + APInt IntVal(8 * ReadSize, ReadBytes); + + // TODO: Should not bother aligning up. + if (ReadNow < ReadSize) + IntVal = IntVal.zext(8 * ReadSize); + + Type *IntTy = Type::getIntNTy(Builder.getContext(), IntVal.getBitWidth()); + WhatToStore.push_back(ConstantInt::get(IntTy, IntVal)); + } + // Additional padding for 8 byte alignment + int Rem = (Str.size() % 8); + if (Rem > 0 && Rem <= 4) + WhatToStore.push_back(ConstantInt::get(Builder.getInt32Ty(), 0)); +} + +static Value *processNonStringArg(Value *Arg, IRBuilder<> &Builder) { + const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); + auto Ty = Arg->getType(); + + if (auto IntTy = dyn_cast(Ty)) { + if (IntTy->getBitWidth() < 64) { + return Builder.CreateZExt(Arg, Builder.getInt64Ty()); + } + } + + if (Ty->isFloatingPointTy()) { + if (DL.getTypeAllocSize(Ty) < 8) { + return Builder.CreateFPExt(Arg, Builder.getDoubleTy()); + } + } + + return Arg; +} + +static void +callBufferedPrintfArgPush(IRBuilder<> &Builder, ArrayRef Args, + Value *PtrToStore, SparseBitVector<8> &SpecIsCString, + SmallVectorImpl &StringContents, + bool IsConstFmtStr) { + Module *M = Builder.GetInsertBlock()->getModule(); + const DataLayout &DL = M->getDataLayout(); + auto StrIt = StringContents.begin(); + size_t i = IsConstFmtStr ? 1 : 0; + for (; i < Args.size(); i++) { + SmallVector WhatToStore; + if ((i == 0) || SpecIsCString.test(i)) { + if (StrIt->IsConst) { + processConstantStringArg(StrIt, Builder, WhatToStore); + StrIt++; + } else { + // This copies the contents of the string, however the next offset + // is at aligned length, the extra space that might be created due + // to alignment padding is not populated with any specific value + // here. This would be safe as long as runtime is sync with + // the offsets. + Builder.CreateMemCpy(PtrToStore, /*DstAlign*/ Align(1), Args[i], + /*SrcAlign*/ Args[i]->getPointerAlignment(DL), + StrIt->RealSize); + + PtrToStore = + Builder.CreateInBoundsGEP(Builder.getInt8Ty(), PtrToStore, + {StrIt->AlignedSize}, "PrintBuffNextPtr"); + LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:" + << *PtrToStore << '\n'); + + // done with current argument, move to next + StrIt++; + continue; + } + } else { + WhatToStore.push_back(processNonStringArg(Args[i], Builder)); + } + + for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) { + Value *toStore = WhatToStore[I]; + + StoreInst *StBuff = Builder.CreateStore(toStore, PtrToStore); + LLVM_DEBUG(dbgs() << "inserting store to printf buffer:" << *StBuff + << '\n'); + PtrToStore = Builder.CreateConstInBoundsGEP1_32( + Builder.getInt8Ty(), PtrToStore, + M->getDataLayout().getTypeAllocSize(toStore->getType()), + "PrintBuffNextPtr"); + LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:" << *PtrToStore + << '\n'); + } + } +} + +Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef Args, + bool IsBuffered) { auto NumOps = Args.size(); assert(NumOps >= 1); auto Fmt = Args[0]; SparseBitVector<8> SpecIsCString; - locateCStrings(SpecIsCString, Fmt); + StringRef FmtStr; + + if (getConstantStringInfo(Fmt, FmtStr)) + locateCStrings(SpecIsCString, FmtStr); + + if (IsBuffered) { + SmallVector StringContents; + Module *M = Builder.GetInsertBlock()->getModule(); + LLVMContext &Ctx = Builder.getContext(); + auto Int8Ty = Builder.getInt8Ty(); + auto Int32Ty = Builder.getInt32Ty(); + bool IsConstFmtStr = !FmtStr.empty(); + + Value *ArgSize = nullptr; + Value *Ptr = + callBufferedPrintfStart(Builder, Args, Fmt, IsConstFmtStr, + SpecIsCString, StringContents, ArgSize); + + // The buffered version still follows OpenCL printf standards for + // printf return value, i.e 0 on success, -1 on failure. + ConstantPointerNull *zeroIntPtr = + ConstantPointerNull::get(cast(Ptr->getType())); + + auto *Cmp = cast(Builder.CreateICmpNE(Ptr, zeroIntPtr, "")); + + BasicBlock *End = BasicBlock::Create(Ctx, "end.block", + Builder.GetInsertBlock()->getParent()); + BasicBlock *ArgPush = BasicBlock::Create( + Ctx, "argpush.block", Builder.GetInsertBlock()->getParent()); + + BranchInst::Create(ArgPush, End, Cmp, Builder.GetInsertBlock()); + Builder.SetInsertPoint(ArgPush); + + // Create controlDWord and store as the first entry, format as follows + // Bit 0 (LSB) -> stream (1 if stderr, 0 if stdout, printf always outputs to + // stdout) Bit 1 -> constant format string (1 if constant) Bits 2-31 -> size + // of printf data frame + auto ConstantTwo = Builder.getInt32(2); + auto ControlDWord = Builder.CreateShl(ArgSize, ConstantTwo); + if (IsConstFmtStr) + ControlDWord = Builder.CreateOr(ControlDWord, ConstantTwo); + + Builder.CreateStore(ControlDWord, Ptr); + + Ptr = Builder.CreateConstInBoundsGEP1_32(Int8Ty, Ptr, 4); + + // Create MD5 hash for costant format string, push low 64 bits of the + // same onto buffer and metadata. + NamedMDNode *metaD = M->getOrInsertNamedMetadata("llvm.printf.fmts"); + if (IsConstFmtStr) { + MD5 Hasher; + MD5::MD5Result Hash; + Hasher.update(FmtStr); + Hasher.final(Hash); + + // Try sticking to llvm.printf.fmts format, although we are not going to + // use the ID and argument size fields while printing, + std::string MetadataStr = + "0:0:" + llvm::utohexstr(Hash.low(), /*LowerCase=*/true) + "," + + FmtStr.str(); + MDString *fmtStrArray = MDString::get(Ctx, MetadataStr); + MDNode *myMD = MDNode::get(Ctx, fmtStrArray); + metaD->addOperand(myMD); + + Builder.CreateStore(Builder.getInt64(Hash.low()), Ptr); + Ptr = Builder.CreateConstInBoundsGEP1_32(Int8Ty, Ptr, 8); + } else { + // Include a dummy metadata instance in case of only non constant + // format string usage, This might be an absurd usecase but needs to + // be done for completeness + if (metaD->getNumOperands() == 0) { + MDString *fmtStrArray = + MDString::get(Ctx, "0:0:ffffffff,\"Non const format string\""); + MDNode *myMD = MDNode::get(Ctx, fmtStrArray); + metaD->addOperand(myMD); + } + } + + // Push The printf arguments onto buffer + callBufferedPrintfArgPush(Builder, Args, Ptr, SpecIsCString, StringContents, + IsConstFmtStr); + + // End block, returns -1 on failure + BranchInst::Create(End, ArgPush); + Builder.SetInsertPoint(End); + return Builder.CreateSExt(Builder.CreateNot(Cmp), Int32Ty, "printf_result"); + } auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0)); Desc = appendString(Builder, Desc, Fmt, NumOps == 1);