diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -247,6 +247,7 @@ LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") +LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2477,6 +2477,10 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, HelpText<"Assert no thread in a parallel region modifies an ICV">, MarshallingInfoFlag>; +def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group, + Flags<[CC1Option, NoArgumentUnused]>, + HelpText<"Do not create a host fallback if offloading to the device fails.">, + MarshallingInfoFlag>; defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime", LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue, PosFlag, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6538,6 +6538,8 @@ // mangled name of the function that encloses the target region and BB is the // line number of the target region. + const bool BuildOutlinedFn = CGM.getLangOpts().OpenMPIsDevice || + !CGM.getLangOpts().OpenMPOffloadMandatory; unsigned DeviceID; unsigned FileID; unsigned Line; @@ -6556,7 +6558,8 @@ CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); - OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc()); + if (BuildOutlinedFn) + OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc()); // If this target outline function is not an offload entry, we don't need to // register it. @@ -6588,9 +6591,20 @@ llvm::Constant::getNullValue(CGM.Int8Ty), Name); } + // If we do not allow host fallback we still need a named address to use. + llvm::Constant *TargetRegionEntryAddr = OutlinedFn; + if (!BuildOutlinedFn) { + assert(!CGM.getModule().getGlobalVariable(EntryFnName, true) && + "Named kernel already exists?"); + TargetRegionEntryAddr = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, + llvm::GlobalValue::InternalLinkage, + llvm::Constant::getNullValue(CGM.Int8Ty), EntryFnName); + } + // Register the information for the entry associated with this target region. OffloadEntriesInfoManager.registerTargetRegionEntryInfo( - DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID, + DeviceID, FileID, ParentName, Line, TargetRegionEntryAddr, OutlinedFnID, OffloadEntriesInfoManagerTy::OMPTargetRegionEntryTargetRegion); // Add NumTeams and ThreadLimit attributes to the outlined GPU function @@ -6607,7 +6621,8 @@ std::to_string(DefaultValThreads)); } - CGM.getTargetCodeGenInfo().setTargetAttributes(nullptr, OutlinedFn, CGM); + if (BuildOutlinedFn) + CGM.getTargetCodeGenInfo().setTargetAttributes(nullptr, OutlinedFn, CGM); } /// Checks if the expression is constant or does not have non-trivial function @@ -10324,7 +10339,10 @@ if (!CGF.HaveInsertPoint()) return; - assert(OutlinedFn && "Invalid outlined function!"); + const bool OffloadingMandatory = !CGM.getLangOpts().OpenMPIsDevice && + CGM.getLangOpts().OpenMPOffloadMandatory; + + assert((OffloadingMandatory || OutlinedFn) && "Invalid outlined function!"); const bool RequiresOuterTask = D.hasClausesOfKind() || D.hasClausesOfKind(); @@ -10339,18 +10357,28 @@ CodeGenFunction::OMPTargetDataInfo InputInfo; llvm::Value *MapTypesArray = nullptr; llvm::Value *MapNamesArray = nullptr; - // Fill up the pointer arrays and transfer execution to the device. - auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo, - &MapTypesArray, &MapNamesArray, &CS, RequiresOuterTask, - &CapturedVars, - SizeEmitter](CodeGenFunction &CGF, PrePostActionTy &) { - if (Device.getInt() == OMPC_DEVICE_ancestor) { - // Reverse offloading is not supported, so just execute on the host. + // Generate code for the host fallback function. + auto &&FallbackGen = [this, OutlinedFn, OutlinedFnID, &D, &CapturedVars, + RequiresOuterTask, &CS, + OffloadingMandatory](CodeGenFunction &CGF) { + if (OffloadingMandatory) { + CGF.Builder.CreateUnreachable(); + } else { if (RequiresOuterTask) { CapturedVars.clear(); CGF.GenerateOpenMPCapturedVars(CS, CapturedVars); } emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars); + } + }; + // Fill up the pointer arrays and transfer execution to the device. + auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo, + &MapTypesArray, &MapNamesArray, &CS, RequiresOuterTask, + &CapturedVars, SizeEmitter, + FallbackGen](CodeGenFunction &CGF, PrePostActionTy &) { + if (Device.getInt() == OMPC_DEVICE_ancestor) { + // Reverse offloading is not supported, so just execute on the host. + FallbackGen(CGF); return; } @@ -10494,25 +10522,17 @@ CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock); CGF.EmitBlock(OffloadFailedBlock); - if (RequiresOuterTask) { - CapturedVars.clear(); - CGF.GenerateOpenMPCapturedVars(CS, CapturedVars); - } - emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars); + FallbackGen(CGF); + CGF.EmitBranch(OffloadContBlock); CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true); }; // Notify that the host version must be executed. - auto &&ElseGen = [this, &D, OutlinedFn, &CS, &CapturedVars, - RequiresOuterTask](CodeGenFunction &CGF, - PrePostActionTy &) { - if (RequiresOuterTask) { - CapturedVars.clear(); - CGF.GenerateOpenMPCapturedVars(CS, CapturedVars); - } - emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars); + auto &&ElseGen = [this, &D, OutlinedFn, &CS, &CapturedVars, RequiresOuterTask, + FallbackGen](CodeGenFunction &CGF, PrePostActionTy &) { + FallbackGen(CGF); }; auto &&TargetThenGen = [this, &ThenGen, &D, &InputInfo, &MapTypesArray, diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -6289,6 +6289,13 @@ if (CGM.getLangOpts().OMPTargetTriples.empty()) IsOffloadEntry = false; + if (CGM.getLangOpts().OpenMPOffloadMandatory && !IsOffloadEntry) { + unsigned DiagID = CGM.getDiags().getCustomDiagID( + DiagnosticsEngine::Error, + "No offloading entry generated while offloading is mandatory."); + CGM.getDiags().Report(DiagID); + } + assert(CGF.CurFuncDecl && "No parent declaration for target region!"); StringRef ParentName; // In case we have Ctors/Dtors we use the complete type variant to produce diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5997,6 +5997,8 @@ CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) CmdArgs.push_back("-fopenmp-assume-no-thread-state"); + if (Args.hasArg(options::OPT_fopenmp_offload_mandatory)) + CmdArgs.push_back("-fopenmp-offload-mandatory"); break; default: // By default, if Clang doesn't know how to generate useful OpenMP code diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2517,7 +2517,7 @@ << HostDevTy; return; } - if (!LangOpts.OpenMPIsDevice && DevTy && + if (!LangOpts.OpenMPIsDevice && !LangOpts.OpenMPOffloadMandatory && DevTy && *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) { // Diagnose nohost function called during host codegen. StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName( diff --git a/clang/test/OpenMP/target_offload_mandatory_codegen.cpp b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-offload-mandatory -emit-llvm %s -o - | FileCheck %s --check-prefix=MANDATORY +// expected-no-diagnostics + +void foo() {} +#pragma omp declare target(foo) + +void bar() {} +#pragma omp declare target device_type(nohost) to(bar) + +void host() { +#pragma omp target + { bar(); } +} + +void host_if(bool cond) { +#pragma omp target if(cond) + { bar(); } +} + +void host_dev(int device) { +#pragma omp target device(device) + { bar(); } +} +// MANDATORY-LABEL: define {{[^@]+}}@_Z3foov +// MANDATORY-SAME: () #[[ATTR0:[0-9]+]] { +// MANDATORY-NEXT: entry: +// MANDATORY-NEXT: ret void +// +// +// MANDATORY-LABEL: define {{[^@]+}}@_Z4hostv +// MANDATORY-SAME: () #[[ATTR0]] { +// MANDATORY-NEXT: entry: +// MANDATORY-NEXT: [[TMP0:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4hostv_l12.region_id, i32 0, i8** null, i8** null, i64* null, i64* null, i8** null, i8** null) +// MANDATORY-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 +// MANDATORY-NEXT: br i1 [[TMP1]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// MANDATORY: omp_offload.failed: +// MANDATORY-NEXT: unreachable +// MANDATORY: omp_offload.cont: +// MANDATORY-NEXT: ret void +// +// +// MANDATORY-LABEL: define {{[^@]+}}@_Z7host_ifb +// MANDATORY-SAME: (i1 noundef zeroext [[COND:%.*]]) #[[ATTR0]] { +// MANDATORY-NEXT: entry: +// MANDATORY-NEXT: [[COND_ADDR:%.*]] = alloca i8, align 1 +// MANDATORY-NEXT: [[FROMBOOL:%.*]] = zext i1 [[COND]] to i8 +// MANDATORY-NEXT: store i8 [[FROMBOOL]], i8* [[COND_ADDR]], align 1 +// MANDATORY-NEXT: [[TMP0:%.*]] = load i8, i8* [[COND_ADDR]], align 1 +// MANDATORY-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 +// MANDATORY-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// MANDATORY: omp_if.then: +// MANDATORY-NEXT: [[TMP1:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7host_ifb_l17.region_id, i32 0, i8** null, i8** null, i64* null, i64* null, i8** null, i8** null) +// MANDATORY-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +// MANDATORY-NEXT: br i1 [[TMP2]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// MANDATORY: omp_offload.failed: +// MANDATORY-NEXT: unreachable +// MANDATORY: omp_offload.cont: +// MANDATORY-NEXT: br label [[OMP_IF_END:%.*]] +// MANDATORY: omp_if.else: +// MANDATORY-NEXT: unreachable +// MANDATORY: omp_if.end: +// MANDATORY-NEXT: ret void +// +// +// MANDATORY-LABEL: define {{[^@]+}}@_Z8host_devi +// MANDATORY-SAME: (i32 noundef signext [[DEVICE:%.*]]) #[[ATTR0]] { +// MANDATORY-NEXT: entry: +// MANDATORY-NEXT: [[DEVICE_ADDR:%.*]] = alloca i32, align 4 +// MANDATORY-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// MANDATORY-NEXT: store i32 [[DEVICE]], i32* [[DEVICE_ADDR]], align 4 +// MANDATORY-NEXT: [[TMP0:%.*]] = load i32, i32* [[DEVICE_ADDR]], align 4 +// MANDATORY-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// MANDATORY-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// MANDATORY-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 +// MANDATORY-NEXT: [[TMP3:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1]], i64 [[TMP2]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8host_devi_l22.region_id, i32 0, i8** null, i8** null, i64* null, i64* null, i8** null, i8** null) +// MANDATORY-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +// MANDATORY-NEXT: br i1 [[TMP4]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// MANDATORY: omp_offload.failed: +// MANDATORY-NEXT: unreachable +// MANDATORY: omp_offload.cont: +// MANDATORY-NEXT: ret void +// +// +// MANDATORY-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// MANDATORY-SAME: () #[[ATTR3:[0-9]+]] { +// MANDATORY-NEXT: entry: +// MANDATORY-NEXT: call void @__tgt_register_requires(i64 1) +// MANDATORY-NEXT: ret void +//