diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -247,6 +247,7 @@ LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") +LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2477,6 +2477,10 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, HelpText<"Assert no thread in a parallel region modifies an ICV">, MarshallingInfoFlag>; +def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group, + Flags<[CC1Option, NoArgumentUnused]>, + HelpText<"Do not create a host fallback if offloading to the device fails.">, + MarshallingInfoFlag>; defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime", LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue, PosFlag, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6538,6 +6538,8 @@ // mangled name of the function that encloses the target region and BB is the // line number of the target region. + const bool BuildOutlinedFn = CGM.getLangOpts().OpenMPIsDevice || + !CGM.getLangOpts().OpenMPOffloadMandatory; unsigned DeviceID; unsigned FileID; unsigned Line; @@ -6556,7 +6558,8 @@ CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); - OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc()); + if (BuildOutlinedFn) + OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc()); // If this target outline function is not an offload entry, we don't need to // register it. @@ -6588,9 +6591,20 @@ llvm::Constant::getNullValue(CGM.Int8Ty), Name); } + // If we do not allow host fallback we still need a named address to use. + llvm::Constant *TargetRegionEntryAddr = OutlinedFn; + if (!BuildOutlinedFn) { + assert(!CGM.getModule().getGlobalVariable(EntryFnName, true) && + "Named kernel already exists?"); + TargetRegionEntryAddr = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, + llvm::GlobalValue::InternalLinkage, + llvm::Constant::getNullValue(CGM.Int8Ty), EntryFnName); + } + // Register the information for the entry associated with this target region. OffloadEntriesInfoManager.registerTargetRegionEntryInfo( - DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID, + DeviceID, FileID, ParentName, Line, TargetRegionEntryAddr, OutlinedFnID, OffloadEntriesInfoManagerTy::OMPTargetRegionEntryTargetRegion); // Add NumTeams and ThreadLimit attributes to the outlined GPU function @@ -6607,7 +6621,8 @@ std::to_string(DefaultValThreads)); } - CGM.getTargetCodeGenInfo().setTargetAttributes(nullptr, OutlinedFn, CGM); + if (BuildOutlinedFn) + CGM.getTargetCodeGenInfo().setTargetAttributes(nullptr, OutlinedFn, CGM); } /// Checks if the expression is constant or does not have non-trivial function @@ -10324,7 +10339,10 @@ if (!CGF.HaveInsertPoint()) return; - assert(OutlinedFn && "Invalid outlined function!"); + const bool OffloadingMandatory = !CGM.getLangOpts().OpenMPIsDevice && + CGM.getLangOpts().OpenMPOffloadMandatory; + + assert((OffloadingMandatory || OutlinedFn) && "Invalid outlined function!"); const bool RequiresOuterTask = D.hasClausesOfKind() || D.hasClausesOfKind(); @@ -10339,18 +10357,28 @@ CodeGenFunction::OMPTargetDataInfo InputInfo; llvm::Value *MapTypesArray = nullptr; llvm::Value *MapNamesArray = nullptr; - // Fill up the pointer arrays and transfer execution to the device. - auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo, - &MapTypesArray, &MapNamesArray, &CS, RequiresOuterTask, - &CapturedVars, - SizeEmitter](CodeGenFunction &CGF, PrePostActionTy &) { - if (Device.getInt() == OMPC_DEVICE_ancestor) { - // Reverse offloading is not supported, so just execute on the host. + // Generate code for the host fallback function. + auto &&FallbackGen = [this, OutlinedFn, OutlinedFnID, &D, &CapturedVars, + RequiresOuterTask, &CS, + OffloadingMandatory](CodeGenFunction &CGF) { + if (OffloadingMandatory) { + CGF.Builder.CreateUnreachable(); + } else { if (RequiresOuterTask) { CapturedVars.clear(); CGF.GenerateOpenMPCapturedVars(CS, CapturedVars); } emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars); + } + }; + // Fill up the pointer arrays and transfer execution to the device. + auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo, + &MapTypesArray, &MapNamesArray, &CS, RequiresOuterTask, + &CapturedVars, SizeEmitter, + FallbackGen](CodeGenFunction &CGF, PrePostActionTy &) { + if (Device.getInt() == OMPC_DEVICE_ancestor) { + // Reverse offloading is not supported, so just execute on the host. + FallbackGen(CGF); return; } @@ -10494,25 +10522,17 @@ CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock); CGF.EmitBlock(OffloadFailedBlock); - if (RequiresOuterTask) { - CapturedVars.clear(); - CGF.GenerateOpenMPCapturedVars(CS, CapturedVars); - } - emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars); + FallbackGen(CGF); + CGF.EmitBranch(OffloadContBlock); CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true); }; // Notify that the host version must be executed. - auto &&ElseGen = [this, &D, OutlinedFn, &CS, &CapturedVars, - RequiresOuterTask](CodeGenFunction &CGF, - PrePostActionTy &) { - if (RequiresOuterTask) { - CapturedVars.clear(); - CGF.GenerateOpenMPCapturedVars(CS, CapturedVars); - } - emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars); + auto &&ElseGen = [this, &D, OutlinedFn, &CS, &CapturedVars, RequiresOuterTask, + FallbackGen](CodeGenFunction &CGF, PrePostActionTy &) { + FallbackGen(CGF); }; auto &&TargetThenGen = [this, &ThenGen, &D, &InputInfo, &MapTypesArray, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5997,6 +5997,8 @@ CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) CmdArgs.push_back("-fopenmp-assume-no-thread-state"); + if (Args.hasArg(options::OPT_fopenmp_offload_mandatory)) + CmdArgs.push_back("-fopenmp-offload-mandatory"); break; default: // By default, if Clang doesn't know how to generate useful OpenMP code diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2517,7 +2517,7 @@ << HostDevTy; return; } - if (!LangOpts.OpenMPIsDevice && DevTy && + if (!LangOpts.OpenMPIsDevice && !LangOpts.OpenMPOffloadMandatory && DevTy && *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) { // Diagnose nohost function called during host codegen. StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName( diff --git a/clang/test/OpenMP/target_offload_mandatory_codegen.cpp b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp @@ -0,0 +1,32 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-offload-mandatory -emit-llvm %s -o - | FileCheck %s --check-prefix=MANDATORY +// expected-no-diagnostics + +int x; +#pragma omp declare target(x) + +void foo(int) {} +#pragma omp declare target device_type(nohost) to(foo) + +void host() { +#pragma omp target + { foo(x); } +} +// MANDATORY-LABEL: define {{[^@]+}}@_Z4hostv +// MANDATORY-SAME: () #[[ATTR1:[0-9]+]] { +// MANDATORY-NEXT: entry: +// MANDATORY-NEXT: [[TMP0:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4hostv_l12.region_id, i32 0, i8** null, i8** null, i64* null, i64* null, i8** null, i8** null) +// MANDATORY-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 +// MANDATORY-NEXT: br i1 [[TMP1]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// MANDATORY: omp_offload.failed: +// MANDATORY-NEXT: unreachable +// MANDATORY: omp_offload.cont: +// MANDATORY-NEXT: ret void +// +// +// MANDATORY-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// MANDATORY-SAME: () #[[ATTR3:[0-9]+]] { +// MANDATORY-NEXT: entry: +// MANDATORY-NEXT: call void @__tgt_register_requires(i64 1) +// MANDATORY-NEXT: ret void +//