diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5106,11 +5106,16 @@ return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy())); } case Builtin::BIprintf: - if (getTarget().getTriple().isNVPTX()) - return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue); - if (getTarget().getTriple().getArch() == Triple::amdgcn && - getLangOpts().HIP) - return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue); + if (getTarget().getTriple().isNVPTX() || + getTarget().getTriple().isAMDGCN()) { + if (getLangOpts().OpenMPIsDevice) + return EmitOpenMPDevicePrintfCallExpr(E); + if (getTarget().getTriple().isNVPTX()) + return EmitNVPTXDevicePrintfCallExpr(E); + if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP) + return EmitAMDGPUDevicePrintfCallExpr(E); + } + break; case Builtin::BI__builtin_canonicalize: case Builtin::BI__builtin_canonicalizef: diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp --- a/clang/lib/CodeGen/CGGPUBuiltin.cpp +++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp @@ -21,13 +21,14 @@ using namespace clang; using namespace CodeGen; -static llvm::Function *GetVprintfDeclaration(llvm::Module &M) { +namespace { +llvm::Function *GetVprintfDeclaration(llvm::Module &M) { llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()), llvm::Type::getInt8PtrTy(M.getContext())}; llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get( llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false); - if (auto* F = M.getFunction("vprintf")) { + if (auto *F = M.getFunction("vprintf")) { // Our CUDA system header declares vprintf with the right signature, so // nobody else should have been able to declare vprintf with a bogus // signature. 
@@ -41,6 +42,28 @@ VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M); } +llvm::Function *GetOpenMPVprintfDeclaration(CodeGenModule &CGM) { + const char *Name = "__llvm_omp_vprintf"; + llvm::Module &M = CGM.getModule(); + llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()), + llvm::Type::getInt8PtrTy(M.getContext()), + llvm::Type::getInt32Ty(M.getContext())}; + llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get( + llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false); + + if (auto *F = M.getFunction(Name)) { + if (F->getFunctionType() != VprintfFuncType) { + CGM.Error(SourceLocation(), + "Invalid type declaration for __llvm_omp_vprintf"); + return nullptr; + } + return F; + } + + return llvm::Function::Create( + VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M); +} + // Transforms a call to printf into a call to the NVPTX vprintf syscall (which // isn't particularly special; it's invoked just like a regular function). // vprintf takes two args: A format string, and a pointer to a buffer containing @@ -67,17 +90,17 @@ // Note that by the time this function runs, E's args have already undergone the // standard C vararg promotion (short -> int, float -> double, etc.). -namespace { -llvm::Value *packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF, - const CallArgList &Args) { +std::pair<llvm::Value *, llvm::TypeSize> +packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF, const CallArgList &Args) { const llvm::DataLayout &DL = CGF->CGM.getDataLayout(); llvm::LLVMContext &Ctx = CGF->CGM.getLLVMContext(); CGBuilderTy &Builder = CGF->Builder; // Construct and fill the args buffer that we'll pass to vprintf. if (Args.size() <= 1) { - // If there are no args, pass a null pointer to vprintf.
- return llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); + // If there are no args, pass a null pointer and size 0 + llvm::Value *BufferPtr = + llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); + return {BufferPtr, llvm::TypeSize::Fixed(0)}; } else { llvm::SmallVector<llvm::Type *, 8> ArgTypes; for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) @@ -96,43 +119,64 @@ llvm::Value *Arg = Args[I].getRValue(*CGF).getScalarVal(); Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType())); } - return Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx)); + llvm::Value *BufferPtr = + Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx)); + return {BufferPtr, DL.getTypeAllocSize(AllocaTy)}; } } -} // namespace -RValue -CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E, - ReturnValueSlot ReturnValue) { - assert(getTarget().getTriple().isNVPTX()); +bool containsNonScalarVarargs(CodeGenFunction *CGF, CallArgList Args) { + return llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) { + return !A.getRValue(*CGF).isScalar(); + }); +} + +RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF, + llvm::Function *Decl, bool WithSizeArg) { + CodeGenModule &CGM = CGF->CGM; + CGBuilderTy &Builder = CGF->Builder; assert(E->getBuiltinCallee() == Builtin::BIprintf); assert(E->getNumArgs() >= 1); // printf always has at least one arg. + // Uses the same format as nvptx for the argument packing, but also passes + // an i32 for the total size of the passed pointer CallArgList Args; - EmitCallArgs(Args, - E->getDirectCallee()->getType()->getAs<FunctionProtoType>(), - E->arguments(), E->getDirectCallee(), - /* ParamsToSkip = */ 0); + CGF->EmitCallArgs(Args, + E->getDirectCallee()->getType()->getAs<FunctionProtoType>(), + E->arguments(), E->getDirectCallee(), + /* ParamsToSkip = */ 0); // We don't know how to emit non-scalar varargs.
- if (llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) { - return !A.getRValue(*this).isScalar(); - })) { + if (containsNonScalarVarargs(CGF, Args)) { CGM.ErrorUnsupported(E, "non-scalar arg to printf"); - return RValue::get(llvm::ConstantInt::get(IntTy, 0)); + return RValue::get(llvm::ConstantInt::get(CGF->IntTy, 0)); } - llvm::Value *BufferPtr = packArgsIntoNVPTXFormatBuffer(this, Args); + auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args); + llvm::Value *BufferPtr = r.first; + + llvm::SmallVector<llvm::Value *, 3> Vec = { + Args[0].getRValue(*CGF).getScalarVal(), BufferPtr}; + if (WithSizeArg) { + // Passing > 32bit of data as a local alloca doesn't work for nvptx or + // amdgpu + llvm::Constant *Size = + llvm::ConstantInt::get(llvm::Type::getInt32Ty(CGM.getLLVMContext()), + static_cast<uint32_t>(r.second.getFixedSize())); - // Invoke vprintf and return. - llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule()); - return RValue::get(Builder.CreateCall( - VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr})); + Vec.push_back(Size); + } + return RValue::get(Builder.CreateCall(Decl, Vec)); } +} // namespace -RValue -CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E, - ReturnValueSlot ReturnValue) { +RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) { + assert(getTarget().getTriple().isNVPTX()); + return EmitDevicePrintfCallExpr( + E, this, GetVprintfDeclaration(CGM.getModule()), false); +} + +RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) { assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn); assert(E->getBuiltinCallee() == Builtin::BIprintf || E->getBuiltinCallee() == Builtin::BI__builtin_printf); @@ -162,3 +206,10 @@ Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint()); return RValue::get(Printf); } + +RValue CodeGenFunction::EmitOpenMPDevicePrintfCallExpr(const CallExpr *E) { + assert(getTarget().getTriple().isNVPTX() || + getTarget().getTriple().isAMDGCN()); +
return EmitDevicePrintfCallExpr(E, this, GetOpenMPVprintfDeclaration(CGM), + true); +} diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4070,10 +4070,9 @@ RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E, ReturnValueSlot ReturnValue); - RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E, - ReturnValueSlot ReturnValue); - RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E, - ReturnValueSlot ReturnValue); + RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E); + RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E); + RValue EmitOpenMPDevicePrintfCallExpr(const CallExpr *E); RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue); diff --git a/clang/test/OpenMP/nvptx_target_printf_codegen.c b/clang/test/OpenMP/nvptx_target_printf_codegen.c --- a/clang/test/OpenMP/nvptx_target_printf_codegen.c +++ b/clang/test/OpenMP/nvptx_target_printf_codegen.c @@ -43,136 +43,496 @@ // // // -// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13 +// +// +// +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13_worker // CHECK-64-SAME: () #[[ATTR0:[0-9]+]] { // CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK-64-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK-64-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK-64-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK-64: .await.work: +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3:[0-9]+]] +// CHECK-64-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]], i16 1) +// CHECK-64-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK-64-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 
1 +// CHECK-64-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK-64-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK-64-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK-64: .select.workers: +// CHECK-64-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK-64-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK-64-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK-64: .execute.parallel: +// CHECK-64-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB0:[0-9]+]]) +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK-64-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK-64-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK-64: .terminate.parallel: +// CHECK-64-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK-64-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK-64: .barrier.parallel: +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-64-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK-64: .exit: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13 +// CHECK-64-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK-64-NEXT: entry: // CHECK-64-NEXT: [[FMT:%.*]] = alloca i8*, align 8 -// CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8 -// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) -// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -// CHECK-64: user_code.entry: +// CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]] +// CHECK-64-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range 
[[RNG8:![0-9]+]] +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9:![0-9]+]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10:![0-9]+]] +// CHECK-64-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK-64-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK-64-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK-64: .worker: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13_worker() #[[ATTR4:[0-9]+]] +// CHECK-64-NEXT: br label [[DOTEXIT:%.*]] +// CHECK-64: .mastercheck: +// CHECK-64-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-64-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK-64-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK-64-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK-64-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK-64-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK-64-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK-64: .master: +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-64-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK-64-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK-64-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK-64-NEXT: store i8* getelementptr inbounds ([11 x 
i8], [11 x i8]* @.str, i64 0, i64 0), i8** [[FMT]], align 8 -// CHECK-64-NEXT: [[TMP1:%.*]] = load i8*, i8** [[FMT]], align 8 -// CHECK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 0 -// CHECK-64-NEXT: store i32 1, i32* [[TMP2]], align 4 -// CHECK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 1 -// CHECK-64-NEXT: store i64 2, i64* [[TMP3]], align 8 -// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 2 -// CHECK-64-NEXT: store double 3.000000e+00, double* [[TMP4]], align 8 -// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast %printf_args* [[TMP]] to i8* -// CHECK-64-NEXT: [[TMP6:%.*]] = call i32 @vprintf(i8* [[TMP1]], i8* [[TMP5]]) -// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +// CHECK-64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[FMT]], align 8 +// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 0 +// CHECK-64-NEXT: store i32 1, i32* [[TMP6]], align 4 +// CHECK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 1 +// CHECK-64-NEXT: store i64 2, i64* [[TMP7]], align 8 +// CHECK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 2 +// CHECK-64-NEXT: store double 3.000000e+00, double* [[TMP8]], align 8 +// CHECK-64-NEXT: [[TMP9:%.*]] = bitcast %printf_args* [[TMP]] to i8* +// CHECK-64-NEXT: [[TMP10:%.*]] = call i32 @vprintf(i8* [[TMP5]], i8* [[TMP9]]) +// CHECK-64-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK-64: .termination.notifier: +// CHECK-64-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-64-NEXT: br label [[DOTEXIT]] +// CHECK-64: .exit: // CHECK-64-NEXT: ret void -// CHECK-64: worker.exit: +// +// +// 
CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25_worker +// CHECK-64-SAME: () #[[ATTR0]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK-64-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK-64-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK-64-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK-64: .await.work: +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-64-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]], i16 1) +// CHECK-64-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK-64-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK-64-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK-64-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK-64-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK-64: .select.workers: +// CHECK-64-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK-64-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK-64-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK-64: .execute.parallel: +// CHECK-64-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB0]]) +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK-64-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK-64-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK-64: .terminate.parallel: +// CHECK-64-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK-64-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK-64: .barrier.parallel: +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-64-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK-64: .exit: // CHECK-64-NEXT: ret void 
// // // CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25 -// CHECK-64-SAME: () #[[ATTR0]] { +// CHECK-64-SAME: () #[[ATTR1]] { // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) -// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -// CHECK-64: user_code.entry: -// CHECK-64-NEXT: [[TMP1:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i64 0, i64 0), i8* null) -// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +// CHECK-64-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-64-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK-64-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK-64-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK-64: .worker: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25_worker() #[[ATTR4]] +// CHECK-64-NEXT: br label [[DOTEXIT:%.*]] +// CHECK-64: .mastercheck: +// CHECK-64-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-64-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK-64-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK-64-NEXT: 
[[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK-64-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK-64-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK-64-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK-64: .master: +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-64-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK-64-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK-64-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK-64-NEXT: [[TMP5:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str2, i64 0, i64 0), i8* null) +// CHECK-64-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK-64: .termination.notifier: +// CHECK-64-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-64-NEXT: br label [[DOTEXIT]] +// CHECK-64: .exit: // CHECK-64-NEXT: ret void -// CHECK-64: worker.exit: +// +// +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36_worker +// CHECK-64-SAME: () #[[ATTR0]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK-64-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK-64-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK-64-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK-64: .await.work: +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-64-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]], i16 1) +// CHECK-64-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// 
CHECK-64-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK-64-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK-64-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK-64-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK-64: .select.workers: +// CHECK-64-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK-64-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK-64-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK-64: .execute.parallel: +// CHECK-64-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB0]]) +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK-64-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK-64-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK-64: .terminate.parallel: +// CHECK-64-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK-64-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK-64: .barrier.parallel: +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-64-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK-64: .exit: // CHECK-64-NEXT: ret void // // // CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36 -// CHECK-64-SAME: (i64 [[FOO:%.*]]) #[[ATTR0]] { +// CHECK-64-SAME: (i64 [[FOO:%.*]]) #[[ATTR1]] { // CHECK-64-NEXT: entry: // CHECK-64-NEXT: [[FOO_ADDR:%.*]] = alloca i64, align 8 -// CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8 +// CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]] // CHECK-64-NEXT: store i64 [[FOO]], i64* [[FOO_ADDR]], align 8 // CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[FOO_ADDR]] to i32* -// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) -// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp 
eq i32 [[TMP0]], -1 -// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -// CHECK-64: user_code.entry: -// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK-64-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0 +// CHECK-64-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-64-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK-64-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK-64-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK-64: .worker: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36_worker() #[[ATTR4]] +// CHECK-64-NEXT: br label [[DOTEXIT:%.*]] +// CHECK-64: .mastercheck: +// CHECK-64-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-64-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK-64-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK-64-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK-64-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK-64-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK-64-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK-64: .master: +// CHECK-64-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-64-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK-64-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK-64-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-64-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP5]], 0 // CHECK-64-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] // CHECK-64: if.then: -// CHECK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], %printf_args.0* [[TMP]], i32 0, i32 0 -// CHECK-64-NEXT: store i32 42, i32* [[TMP2]], align 4 -// CHECK-64-NEXT: [[TMP3:%.*]] = bitcast %printf_args.0* [[TMP]] to i8* -// CHECK-64-NEXT: [[TMP4:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str2, i64 0, i64 0), i8* [[TMP3]]) +// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], %printf_args.0* [[TMP]], i32 0, i32 0 +// CHECK-64-NEXT: store i32 42, i32* [[TMP6]], align 4 +// CHECK-64-NEXT: [[TMP7:%.*]] = bitcast %printf_args.0* [[TMP]] to i8* +// CHECK-64-NEXT: [[TMP8:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str3, i64 0, i64 0), i8* [[TMP7]]) // CHECK-64-NEXT: br label [[IF_END]] -// CHECK-64: worker.exit: -// CHECK-64-NEXT: ret void // CHECK-64: if.end: -// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +// CHECK-64-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK-64: .termination.notifier: +// CHECK-64-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK-64-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-64-NEXT: br label [[DOTEXIT]] +// CHECK-64: .exit: // CHECK-64-NEXT: ret void // // // // // -// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13 +// +// +// +// CHECK-32-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13_worker // CHECK-32-SAME: () #[[ATTR0:[0-9]+]] { // CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK-32-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK-32: .await.work: +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3:[0-9]+]] +// CHECK-32-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]], i16 1) +// CHECK-32-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK-32-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK-32-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK-32-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK-32: .select.workers: +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK-32-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK-32: .execute.parallel: +// CHECK-32-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB0:[0-9]+]]) +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK-32-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK-32-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK-32: .terminate.parallel: +// CHECK-32-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK-32-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK-32: .barrier.parallel: +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-32-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK-32: .exit: +// CHECK-32-NEXT: ret void +// 
+// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13 +// CHECK-32-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK-32-NEXT: entry: // CHECK-32-NEXT: [[FMT:%.*]] = alloca i8*, align 4 -// CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8 -// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) -// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]] +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8:![0-9]+]] +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9:![0-9]+]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10:![0-9]+]] +// CHECK-32-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK-32-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK-32-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK-32: .worker: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13_worker() #[[ATTR4:[0-9]+]] +// CHECK-32-NEXT: br label [[DOTEXIT:%.*]] +// CHECK-32: .mastercheck: +// CHECK-32-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-32-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK-32-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK-32-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// 
CHECK-32-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK-32-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK-32-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK-32: .master: +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-32-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK-32-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK-32-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK-32-NEXT: store i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8** [[FMT]], align 4 -// CHECK-32-NEXT: [[TMP1:%.*]] = load i8*, i8** [[FMT]], align 4 -// CHECK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 0 -// CHECK-32-NEXT: store i32 1, i32* [[TMP2]], align 4 -// CHECK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 1 -// CHECK-32-NEXT: store i64 2, i64* [[TMP3]], align 8 -// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 2 -// CHECK-32-NEXT: store double 3.000000e+00, double* [[TMP4]], align 8 -// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast %printf_args* [[TMP]] to i8* -// CHECK-32-NEXT: [[TMP6:%.*]] = call i32 @vprintf(i8* [[TMP1]], i8* [[TMP5]]) -// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +// CHECK-32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[FMT]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 0 +// CHECK-32-NEXT: store i32 1, i32* [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 1 +// 
CHECK-32-NEXT: store i64 2, i64* [[TMP7]], align 8 +// CHECK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 2 +// CHECK-32-NEXT: store double 3.000000e+00, double* [[TMP8]], align 8 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast %printf_args* [[TMP]] to i8* +// CHECK-32-NEXT: [[TMP10:%.*]] = call i32 @vprintf(i8* [[TMP5]], i8* [[TMP9]]) +// CHECK-32-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK-32: .termination.notifier: +// CHECK-32-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-32-NEXT: br label [[DOTEXIT]] +// CHECK-32: .exit: // CHECK-32-NEXT: ret void -// CHECK-32: worker.exit: +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25_worker +// CHECK-32-SAME: () #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK-32-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK-32: .await.work: +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-32-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]], i16 1) +// CHECK-32-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK-32-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK-32-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK-32-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK-32: .select.workers: +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK-32-NEXT: br i1 
[[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK-32: .execute.parallel: +// CHECK-32-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB0]]) +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK-32-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK-32-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK-32: .terminate.parallel: +// CHECK-32-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK-32-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK-32: .barrier.parallel: +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-32-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK-32: .exit: // CHECK-32-NEXT: ret void // // // CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25 -// CHECK-32-SAME: () #[[ATTR0]] { +// CHECK-32-SAME: () #[[ATTR1]] { // CHECK-32-NEXT: entry: -// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) -// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -// CHECK-32: user_code.entry: -// CHECK-32-NEXT: [[TMP1:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i32 0, i32 0), i8* null) -// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-32-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK-32-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], 
[[THREAD_LIMIT]] +// CHECK-32-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK-32: .worker: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25_worker() #[[ATTR4]] +// CHECK-32-NEXT: br label [[DOTEXIT:%.*]] +// CHECK-32: .mastercheck: +// CHECK-32-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-32-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK-32-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK-32-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK-32-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK-32-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK-32-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK-32: .master: +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-32-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK-32-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK-32-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK-32-NEXT: [[TMP5:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str2, i32 0, i32 0), i8* null) +// CHECK-32-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK-32: .termination.notifier: +// CHECK-32-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-32-NEXT: br label [[DOTEXIT]] +// CHECK-32: .exit: // CHECK-32-NEXT: ret 
void -// CHECK-32: worker.exit: +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36_worker +// CHECK-32-SAME: () #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK-32-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK-32: .await.work: +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-32-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]], i16 1) +// CHECK-32-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK-32-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK-32-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK-32-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK-32: .select.workers: +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK-32-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK-32-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK-32: .execute.parallel: +// CHECK-32-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB0]]) +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK-32-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK-32-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK-32: .terminate.parallel: +// CHECK-32-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK-32-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK-32: .barrier.parallel: +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-32-NEXT: br label 
[[DOTAWAIT_WORK]] +// CHECK-32: .exit: // CHECK-32-NEXT: ret void // // // CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36 -// CHECK-32-SAME: (i32 [[FOO:%.*]]) #[[ATTR0]] { +// CHECK-32-SAME: (i32 [[FOO:%.*]]) #[[ATTR1]] { // CHECK-32-NEXT: entry: // CHECK-32-NEXT: [[FOO_ADDR:%.*]] = alloca i32, align 4 -// CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8 +// CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]] // CHECK-32-NEXT: store i32 [[FOO]], i32* [[FOO_ADDR]], align 4 -// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) -// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -// CHECK-32: user_code.entry: -// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[FOO_ADDR]], align 4 -// CHECK-32-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-32-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK-32-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK-32-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK-32: .worker: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36_worker() #[[ATTR4]] +// CHECK-32-NEXT: br label [[DOTEXIT:%.*]] +// CHECK-32: .mastercheck: +// CHECK-32-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range 
[[RNG9]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-32-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK-32-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK-32-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK-32-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK-32-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK-32-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK-32: .master: +// CHECK-32-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK-32-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK-32-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK-32-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[FOO_ADDR]], align 4 +// CHECK-32-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP5]], 0 // CHECK-32-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] // CHECK-32: if.then: -// CHECK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], %printf_args.0* [[TMP]], i32 0, i32 0 -// CHECK-32-NEXT: store i32 42, i32* [[TMP2]], align 4 -// CHECK-32-NEXT: [[TMP3:%.*]] = bitcast %printf_args.0* [[TMP]] to i8* -// CHECK-32-NEXT: [[TMP4:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str2, i32 0, i32 0), i8* [[TMP3]]) +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], %printf_args.0* [[TMP]], i32 0, i32 0 +// CHECK-32-NEXT: store i32 42, i32* [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast %printf_args.0* [[TMP]] to i8* +// CHECK-32-NEXT: [[TMP8:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str3, i32 0, i32 0), 
i8* [[TMP7]]) // CHECK-32-NEXT: br label [[IF_END]] -// CHECK-32: worker.exit: -// CHECK-32-NEXT: ret void // CHECK-32: if.end: -// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +// CHECK-32-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK-32: .termination.notifier: +// CHECK-32-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK-32-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #[[ATTR3]] +// CHECK-32-NEXT: br label [[DOTEXIT]] +// CHECK-32: .exit: // CHECK-32-NEXT: ret void // diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h --- a/openmp/libomptarget/DeviceRTL/include/Debug.h +++ b/openmp/libomptarget/DeviceRTL/include/Debug.h @@ -34,23 +34,15 @@ ///} /// Print -/// TODO: For now we have to use macros to guard the code because Clang lowers -/// `printf` to different function calls on NVPTX and AMDGCN platforms, and it -/// doesn't work for AMDGCN. After it can work on AMDGCN, we will remove the -/// macro. +/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf /// { -#ifndef __AMDGCN__ extern "C" { int printf(const char *format, ...); } -#define PRINTF(fmt, ...) (void)printf(fmt, __VA_ARGS__); +#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__); #define PRINT(str) PRINTF("%s", str) -#else -#define PRINTF(fmt, ...) 
-#define PRINT(str) -#endif ///} diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp --- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -29,6 +29,29 @@ assertion); __builtin_trap(); } + +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) +int32_t vprintf(const char *, void *); +namespace impl { +static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) { + return vprintf(Format, Arguments); +} +} // namespace impl +#pragma omp end declare variant + +// We do not have a vprintf implementation for AMD GPU yet so we use a stub. +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +namespace impl { +static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) { + return -1; +} +} // namespace impl +#pragma omp end declare variant + +int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) { + return impl::omp_vprintf(Format, Arguments, Size); +} } /// Current indentation level for the function trace. Only accessed by thread 0. 
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -184,6 +184,11 @@ } __attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {} +EXTERN +int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) { + return -1; +} + EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -184,9 +184,15 @@ extern "C" { void *malloc(size_t); void free(void *); +int32_t vprintf(const char *, void *); } EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); } EXTERN void __kmpc_impl_free(void *x) { free(x); } +EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, + uint32_t) { + return vprintf(Format, Arguments); +} + #pragma omp end declare target diff --git a/openmp/libomptarget/test/mapping/data_member_ref.cpp b/openmp/libomptarget/test/mapping/data_member_ref.cpp --- a/openmp/libomptarget/test/mapping/data_member_ref.cpp +++ b/openmp/libomptarget/test/mapping/data_member_ref.cpp @@ -1,6 +1,6 @@ // RUN: %libomptarget-compilexx-run-and-check-generic -// amdgcn does not have printf definition +// Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa // XFAIL: amdgcn-amd-amdhsa-newRTL diff --git a/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp b/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp --- a/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp +++ 
b/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp @@ -1,6 +1,6 @@ // RUN: %libomptarget-compilexx-run-and-check-generic -// amdgcn does not have printf definition +// Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa // XFAIL: amdgcn-amd-amdhsa-newRTL diff --git a/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp b/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp --- a/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp @@ -1,6 +1,6 @@ // RUN: %libomptarget-compilexx-run-and-check-generic -// amdgcn does not have printf definition +// Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa // XFAIL: amdgcn-amd-amdhsa-newRTL diff --git a/openmp/libomptarget/test/mapping/lambda_by_value.cpp b/openmp/libomptarget/test/mapping/lambda_by_value.cpp --- a/openmp/libomptarget/test/mapping/lambda_by_value.cpp +++ b/openmp/libomptarget/test/mapping/lambda_by_value.cpp @@ -1,6 +1,6 @@ // RUN: %libomptarget-compilexx-run-and-check-generic -// amdgcn does not have printf definition +// Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa // XFAIL: amdgcn-amd-amdhsa-newRTL diff --git a/openmp/libomptarget/test/mapping/ompx_hold/struct.c b/openmp/libomptarget/test/mapping/ompx_hold/struct.c --- a/openmp/libomptarget/test/mapping/ompx_hold/struct.c +++ b/openmp/libomptarget/test/mapping/ompx_hold/struct.c @@ -1,7 +1,7 @@ // RUN: %libomptarget-compile-generic -fopenmp-extensions // RUN: %libomptarget-run-generic | %fcheck-generic -strict-whitespace -// amdgcn does not have printf definition +// Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa // XFAIL: amdgcn-amd-amdhsa-newRTL diff --git a/openmp/libomptarget/test/mapping/ptr_and_obj_motion.c b/openmp/libomptarget/test/mapping/ptr_and_obj_motion.c --- a/openmp/libomptarget/test/mapping/ptr_and_obj_motion.c +++ b/openmp/libomptarget/test/mapping/ptr_and_obj_motion.c @@ -1,9 +1,5 @@ // 
RUN: %libomptarget-compile-run-and-check-generic -// amdgcn does not have printf definition -// XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL - #include typedef struct { diff --git a/openmp/libomptarget/test/mapping/reduction_implicit_map.cpp b/openmp/libomptarget/test/mapping/reduction_implicit_map.cpp --- a/openmp/libomptarget/test/mapping/reduction_implicit_map.cpp +++ b/openmp/libomptarget/test/mapping/reduction_implicit_map.cpp @@ -1,9 +1,5 @@ // RUN: %libomptarget-compilexx-run-and-check-generic -// amdgcn does not have printf definition -// UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL - #include void sum(int* input, int size, int* output) diff --git a/openmp/libomptarget/test/offloading/bug49021.cpp b/openmp/libomptarget/test/offloading/bug49021.cpp --- a/openmp/libomptarget/test/offloading/bug49021.cpp +++ b/openmp/libomptarget/test/offloading/bug49021.cpp @@ -1,8 +1,7 @@ // RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic -// Wrong results on amdgcn -// UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL +// Wrong results on amdgpu +// XFAIL: amdgcn-amd-amdhsa #include diff --git a/openmp/libomptarget/test/offloading/bug50022.cpp b/openmp/libomptarget/test/offloading/bug50022.cpp --- a/openmp/libomptarget/test/offloading/bug50022.cpp +++ b/openmp/libomptarget/test/offloading/bug50022.cpp @@ -1,8 +1,5 @@ // RUN: %libomptarget-compilexx-and-run-generic -// UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL - #include #include #include diff --git a/openmp/libomptarget/test/offloading/host_as_target.c b/openmp/libomptarget/test/offloading/host_as_target.c --- a/openmp/libomptarget/test/offloading/host_as_target.c +++ b/openmp/libomptarget/test/offloading/host_as_target.c @@ -7,7 +7,7 @@ // RUN: %libomptarget-compile-run-and-check-generic -// amdgcn does not have printf definition +// amdgpu does not have a working printf definition // XFAIL: 
amdgcn-amd-amdhsa // XFAIL: amdgcn-amd-amdhsa-newRTL diff --git a/openmp/libomptarget/test/unified_shared_memory/api.c b/openmp/libomptarget/test/unified_shared_memory/api.c --- a/openmp/libomptarget/test/unified_shared_memory/api.c +++ b/openmp/libomptarget/test/unified_shared_memory/api.c @@ -2,7 +2,7 @@ // XFAIL: nvptx64-nvidia-cuda // XFAIL: nvptx64-nvidia-cuda-newRTL -// Fails on amdgcn with error: GPU Memory Error +// Fails on amdgpu with error: GPU Memory Error // XFAIL: amdgcn-amd-amdhsa // XFAIL: amdgcn-amd-amdhsa-newRTL diff --git a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c --- a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c @@ -3,7 +3,7 @@ // REQUIRES: unified_shared_memory // UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 -// Fails on amdgcn with error: GPU Memory Error +// Fails on amdgpu with error: GPU Memory Error // XFAIL: amdgcn-amd-amdhsa // XFAIL: amdgcn-amd-amdhsa-newRTL diff --git a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c --- a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c @@ -3,9 +3,9 @@ // REQUIRES: unified_shared_memory // UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 -// amdgcn does not have printf definition -// XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL +// amdgpu runtime crash +// UNSUPPORTED: amdgcn-amd-amdhsa + #include #include diff --git a/openmp/libomptarget/test/unified_shared_memory/shared_update.c b/openmp/libomptarget/test/unified_shared_memory/shared_update.c --- a/openmp/libomptarget/test/unified_shared_memory/shared_update.c +++ b/openmp/libomptarget/test/unified_shared_memory/shared_update.c @@ -2,9 +2,8 @@ // REQUIRES: unified_shared_memory -// 
amdgcn does not have printf definition -// XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL +// amdgpu runtime crash +// UNSUPPORTED: amdgcn-amd-amdhsa #include #include