Index: clang/lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.h
+++ clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -327,42 +327,6 @@
                                    bool IsOffloadEntry,
                                    const RegionCodeGenTy &CodeGen);
 
-  /// Emits object of ident_t type with info for source location.
-  /// \param Flags Flags for OpenMP location.
-  /// \param EmitLoc emit source location with debug-info is off.
-  ///
-  llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
-                                  unsigned Flags = 0, bool EmitLoc = false);
-
-  /// Emit the number of teams for a target directive. Inspect the num_teams
-  /// clause associated with a teams construct combined or closely nested
-  /// with the target directive.
-  ///
-  /// Emit a team of size one for directives such as 'target parallel' that
-  /// have no associated teams construct.
-  ///
-  /// Otherwise, return nullptr.
-  const Expr *getNumTeamsExprForTargetDirective(CodeGenFunction &CGF,
-                                                const OMPExecutableDirective &D,
-                                                int32_t &DefaultVal);
-  llvm::Value *emitNumTeamsForTargetDirective(CodeGenFunction &CGF,
-                                              const OMPExecutableDirective &D);
-  /// Emit the number of threads for a target directive. Inspect the
-  /// thread_limit clause associated with a teams construct combined or closely
-  /// nested with the target directive.
-  ///
-  /// Emit the num_threads clause for directives such as 'target parallel' that
-  /// have no associated teams construct.
-  ///
-  /// Otherwise, return nullptr.
-  const Expr *
-  getNumThreadsExprForTargetDirective(CodeGenFunction &CGF,
-                                      const OMPExecutableDirective &D,
-                                      int32_t &DefaultVal);
-  llvm::Value *
-  emitNumThreadsForTargetDirective(CodeGenFunction &CGF,
-                                   const OMPExecutableDirective &D);
-
   /// Returns pointer to ident_t type.
   llvm::Type *getIdentTyPointerTy();
 
@@ -656,15 +620,6 @@
                              llvm::Function *TaskFunction, QualType SharedsTy,
                              Address Shareds, const OMPTaskDataTy &Data);
 
-  /// Return the trip count of loops associated with constructs / 'target teams
-  /// distribute' and 'teams distribute parallel for'. \param SizeEmitter Emits
-  /// the int64 value for the number of iterations of the associated loop.
-  llvm::Value *emitTargetNumIterationsCall(
-      CodeGenFunction &CGF, const OMPExecutableDirective &D,
-      llvm::function_ref<llvm::Value *(CodeGenFunction &CGF,
-                                       const OMPLoopDirective &D)>
-          SizeEmitter);
-
   /// Emit update for lastprivate conditional data.
   void emitLastprivateConditionalUpdate(CodeGenFunction &CGF, LValue IVLVal,
                                         StringRef UniqueDeclName, LValue LVal,
@@ -691,6 +646,51 @@
   virtual ~CGOpenMPRuntime() {}
   virtual void clear();
 
+  /// Emits object of ident_t type with info for source location.
+  /// \param Flags Flags for OpenMP location.
+  /// \param EmitLoc emit source location with debug-info is off.
+  ///
+  llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
+                                  unsigned Flags = 0, bool EmitLoc = false);
+
+  /// Emit the number of teams for a target directive. Inspect the num_teams
+  /// clause associated with a teams construct combined or closely nested
+  /// with the target directive.
+  ///
+  /// Emit a team of size one for directives such as 'target parallel' that
+  /// have no associated teams construct.
+  ///
+  /// Otherwise, return nullptr.
+  const Expr *getNumTeamsExprForTargetDirective(CodeGenFunction &CGF,
+                                                const OMPExecutableDirective &D,
+                                                int32_t &DefaultVal);
+  llvm::Value *emitNumTeamsForTargetDirective(CodeGenFunction &CGF,
+                                              const OMPExecutableDirective &D);
+  /// Emit the number of threads for a target directive. Inspect the
+  /// thread_limit clause associated with a teams construct combined or closely
+  /// nested with the target directive.
+  ///
+  /// Emit the num_threads clause for directives such as 'target parallel' that
+  /// have no associated teams construct.
+  ///
+  /// Otherwise, return nullptr.
+  const Expr *
+  getNumThreadsExprForTargetDirective(CodeGenFunction &CGF,
+                                      const OMPExecutableDirective &D,
+                                      int32_t &DefaultVal);
+  llvm::Value *
+  emitNumThreadsForTargetDirective(CodeGenFunction &CGF,
+                                   const OMPExecutableDirective &D);
+
+  /// Return the trip count of loops associated with constructs / 'target teams
+  /// distribute' and 'teams distribute parallel for'. \param SizeEmitter Emits
+  /// the int64 value for the number of iterations of the associated loop.
+  llvm::Value *emitTargetNumIterationsCall(
+      CodeGenFunction &CGF, const OMPExecutableDirective &D,
+      llvm::function_ref<llvm::Value *(CodeGenFunction &CGF,
+                                       const OMPLoopDirective &D)>
+          SizeEmitter);
+
   /// Returns true if the current target is a GPU.
   virtual bool isTargetCodegen() const { return false; }
Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -9590,6 +9590,247 @@
   return llvm::ConstantInt::get(CGF.Int64Ty, 0);
 }
 
+static void
+emitTargetCallFallback(CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
+                       const OMPExecutableDirective &D,
+                       llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
+                       bool RequiresOuterTask, const CapturedStmt &CS,
+                       bool OffloadingMandatory, CodeGenFunction &CGF) {
+  if (OffloadingMandatory) {
+    CGF.Builder.CreateUnreachable();
+  } else {
+    if (RequiresOuterTask) {
+      CapturedVars.clear();
+      CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
+    }
+    OMPRuntime->emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn,
+                                         CapturedVars);
+  }
+}
+
+static llvm::Value *emitDeviceID(
+    llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device,
+    CodeGenFunction &CGF) {
+  // Emit device ID if any.
+  llvm::Value *DeviceID;
+  if (Device.getPointer()) {
+    assert((Device.getInt() == OMPC_DEVICE_unknown ||
+            Device.getInt() == OMPC_DEVICE_device_num) &&
+           "Expected device_num modifier.");
+    llvm::Value *DevVal = CGF.EmitScalarExpr(Device.getPointer());
+    DeviceID =
+        CGF.Builder.CreateIntCast(DevVal, CGF.Int64Ty, /*isSigned=*/true);
+  } else {
+    DeviceID = CGF.Builder.getInt64(OMP_DEVICEID_UNDEF);
+  }
+  return DeviceID;
+}
+
+llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D,
+                               CodeGenFunction &CGF) {
+  llvm::Value *DynCGroupMem = CGF.Builder.getInt32(0);
+
+  if (auto *DynMemClause = D.getSingleClause<OMPXDynCGroupMemClause>()) {
+    CodeGenFunction::RunCleanupsScope DynCGroupMemScope(CGF);
+    llvm::Value *DynCGroupMemVal = CGF.EmitScalarExpr(
+        DynMemClause->getSize(), /*IgnoreResultAssign=*/true);
+    DynCGroupMem = CGF.Builder.CreateIntCast(DynCGroupMemVal, CGF.Int32Ty,
+                                             /*isSigned=*/false);
+  }
+  return DynCGroupMem;
+}
+
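[Reviewer note, not part of the patch] For context, emitDeviceID and emitDynCGGroupMem lower the 'device' and 'ompx_dyn_cgroup_mem' clauses of a target construct. A minimal sketch of source code that would exercise both helpers (illustrative only; Dev, N, and Buf are hypothetical names):

    // device(Dev) populates the Device pair, so emitDeviceID emits an i64
    // cast of Dev; ompx_dyn_cgroup_mem(N * 4) makes emitDynCGGroupMem emit
    // an i32 cast of the clause expression instead of the default constant 0.
    void bump(int Dev, int N, int *Buf) {
    #pragma omp target device(Dev) ompx_dyn_cgroup_mem(N * 4) map(tofrom : Buf[0 : N])
      for (int I = 0; I < N; ++I)
        Buf[I] += 1;
    }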
+static void emitTargetCallKernelLaunch(
+    CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
+    const OMPExecutableDirective &D,
+    llvm::SmallVectorImpl<llvm::Value *> &CapturedVars, bool RequiresOuterTask,
+    const CapturedStmt &CS, bool OffloadingMandatory,
+    llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device,
+    llvm::Value *OutlinedFnID, CodeGenFunction::OMPTargetDataInfo &InputInfo,
+    llvm::Value *&MapTypesArray, llvm::Value *&MapNamesArray,
+    llvm::function_ref<llvm::Value *(CodeGenFunction &CGF,
+                                     const OMPLoopDirective &D)>
+        SizeEmitter,
+    CodeGenFunction &CGF, CodeGenModule &CGM) {
+  llvm::OpenMPIRBuilder &OMPBuilder = OMPRuntime->getOMPBuilder();
+
+  // Fill up the arrays with all the captured variables.
+  MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
+
+  // Get mappable expression information.
+  MappableExprsHandler MEHandler(D, CGF);
+  llvm::DenseMap<llvm::Value *, llvm::Value *> LambdaPointers;
+  llvm::DenseSet<CanonicalDeclPtr<const Decl>> MappedVarSet;
+
+  auto RI = CS.getCapturedRecordDecl()->field_begin();
+  auto *CV = CapturedVars.begin();
+  for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
+                                            CE = CS.capture_end();
+       CI != CE; ++CI, ++RI, ++CV) {
+    MappableExprsHandler::MapCombinedInfoTy CurInfo;
+    MappableExprsHandler::StructRangeInfoTy PartialStruct;
+
+    // VLA sizes are passed to the outlined region by copy and do not have map
+    // information associated.
+    if (CI->capturesVariableArrayType()) {
+      CurInfo.Exprs.push_back(nullptr);
+      CurInfo.BasePointers.push_back(*CV);
+      CurInfo.DevicePtrDecls.push_back(nullptr);
+      CurInfo.Pointers.push_back(*CV);
+      CurInfo.Sizes.push_back(CGF.Builder.CreateIntCast(
+          CGF.getTypeSize(RI->getType()), CGF.Int64Ty, /*isSigned=*/true));
+      // Copy to the device as an argument. No need to retrieve it.
+      CurInfo.Types.push_back(OpenMPOffloadMappingFlags::OMP_MAP_LITERAL |
+                              OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM |
+                              OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT);
+      CurInfo.Mappers.push_back(nullptr);
+    } else {
+      // If we have any information in the map clause, we use it, otherwise we
+      // just do a default mapping.
+      MEHandler.generateInfoForCapture(CI, *CV, CurInfo, PartialStruct);
+      if (!CI->capturesThis())
+        MappedVarSet.insert(CI->getCapturedVar());
+      else
+        MappedVarSet.insert(nullptr);
+      if (CurInfo.BasePointers.empty() && !PartialStruct.Base.isValid())
+        MEHandler.generateDefaultMapInfo(*CI, **RI, *CV, CurInfo);
+      // Generate correct mapping for variables captured by reference in
+      // lambdas.
+      if (CI->capturesVariable())
+        MEHandler.generateInfoForLambdaCaptures(CI->getCapturedVar(), *CV,
+                                                CurInfo, LambdaPointers);
+    }
+    // We expect to have at least an element of information for this capture.
+    assert((!CurInfo.BasePointers.empty() || PartialStruct.Base.isValid()) &&
+           "Non-existing map pointer for capture!");
+    assert(CurInfo.BasePointers.size() == CurInfo.Pointers.size() &&
+           CurInfo.BasePointers.size() == CurInfo.Sizes.size() &&
+           CurInfo.BasePointers.size() == CurInfo.Types.size() &&
+           CurInfo.BasePointers.size() == CurInfo.Mappers.size() &&
+           "Inconsistent map information sizes!");
+
+    // If there is an entry in PartialStruct it means we have a struct with
+    // individual members mapped. Emit an extra combined entry.
+    if (PartialStruct.Base.isValid()) {
+      CombinedInfo.append(PartialStruct.PreliminaryMapData);
+      MEHandler.emitCombinedEntry(
+          CombinedInfo, CurInfo.Types, PartialStruct, CI->capturesThis(),
+          nullptr, !PartialStruct.PreliminaryMapData.BasePointers.empty());
+    }
+
+    // We need to append the results of this capture to what we already have.
+    CombinedInfo.append(CurInfo);
+  }
+  // Adjust MEMBER_OF flags for the lambdas captures.
+  MEHandler.adjustMemberOfForLambdaCaptures(
+      LambdaPointers, CombinedInfo.BasePointers, CombinedInfo.Pointers,
+      CombinedInfo.Types);
+  // Map any list items in a map clause that were not captures because they
+  // weren't referenced within the construct.
+  MEHandler.generateAllInfo(CombinedInfo, MappedVarSet);
+
+  CGOpenMPRuntime::TargetDataInfo Info;
+  // Fill up the arrays and create the arguments.
+  emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder);
+  bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
+                   llvm::codegenoptions::NoDebugInfo;
+  OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
+                                          EmitDebug,
+                                          /*ForEndCall=*/false);
+
+  InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
+  InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
+                                        CGF.VoidPtrTy, CGM.getPointerAlign());
+  InputInfo.PointersArray =
+      Address(Info.RTArgs.PointersArray, CGF.VoidPtrTy, CGM.getPointerAlign());
+  InputInfo.SizesArray =
+      Address(Info.RTArgs.SizesArray, CGF.Int64Ty, CGM.getPointerAlign());
+  InputInfo.MappersArray =
+      Address(Info.RTArgs.MappersArray, CGF.VoidPtrTy, CGM.getPointerAlign());
+  MapTypesArray = Info.RTArgs.MapTypesArray;
+  MapNamesArray = Info.RTArgs.MapNamesArray;
+
+  auto &&ThenGen = [&OMPRuntime, OutlinedFn, &D, &CapturedVars,
+                    RequiresOuterTask, &CS, OffloadingMandatory, Device,
+                    OutlinedFnID, &InputInfo, &MapTypesArray, &MapNamesArray,
+                    SizeEmitter](CodeGenFunction &CGF, PrePostActionTy &) {
+    bool IsReverseOffloading = Device.getInt() == OMPC_DEVICE_ancestor;
+
+    if (IsReverseOffloading) {
+      // Reverse offloading is not supported, so just execute on the host.
+      emitTargetCallFallback(OMPRuntime, OutlinedFn, D, CapturedVars,
+                             RequiresOuterTask, CS, OffloadingMandatory, CGF);
+      return;
+    }
+
+    bool HasNoWait = D.hasClausesOfKind<OMPNowaitClause>();
+    unsigned numTargetItems = InputInfo.NumberOfTargetItems;
+
+    llvm::Value *basePointersArray = InputInfo.BasePointersArray.getPointer();
+    llvm::Value *pointersArray = InputInfo.PointersArray.getPointer();
+    llvm::Value *sizesArray = InputInfo.SizesArray.getPointer();
+    llvm::Value *mappersArray = InputInfo.MappersArray.getPointer();
+
+    auto &&emitTargetCallFallbackCB =
+        [&OMPRuntime, OutlinedFn, &D, &CapturedVars, RequiresOuterTask, &CS,
+         OffloadingMandatory, &CGF](llvm::OpenMPIRBuilder::InsertPointTy IP)
+        -> llvm::OpenMPIRBuilder::InsertPointTy {
+      CGF.Builder.restoreIP(IP);
+      emitTargetCallFallback(OMPRuntime, OutlinedFn, D, CapturedVars,
+                             RequiresOuterTask, CS, OffloadingMandatory, CGF);
+      return CGF.Builder.saveIP();
+    };
+
+    llvm::Value *DeviceID = emitDeviceID(Device, CGF);
+    llvm::Value *NumTeams = OMPRuntime->emitNumTeamsForTargetDirective(CGF, D);
+    llvm::Value *NumThreads =
+        OMPRuntime->emitNumThreadsForTargetDirective(CGF, D);
+    llvm::Value *RTLoc = OMPRuntime->emitUpdateLocation(CGF, D.getBeginLoc());
+    llvm::Value *NumIterations =
+        OMPRuntime->emitTargetNumIterationsCall(CGF, D, SizeEmitter);
+    llvm::Value *DynCGGroupMem = emitDynCGGroupMem(D, CGF);
+    llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
+        CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator());
+
+    llvm::TargetKernelArgs Args(numTargetItems, basePointersArray,
+                                pointersArray, sizesArray, MapTypesArray,
+                                MapNamesArray, mappersArray, NumIterations,
+                                NumTeams, NumThreads, DynCGGroupMem, HasNoWait);
+
+    CGF.Builder.restoreIP(OMPRuntime->getOMPBuilder().emitKernelLaunch(
+        CGF.Builder, OutlinedFn, OutlinedFnID, emitTargetCallFallbackCB, Args,
+        DeviceID, RTLoc, AllocaIP));
+  };
+
+  if (RequiresOuterTask)
+    CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
+  else
+    OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
+}
+
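[Reviewer note, not part of the patch] The values bundled into llvm::TargetKernelArgs in ThenGen above come straight from clauses of the directive. A hedged example of a directive that populates most of them (names and clause values are illustrative):

    // num_teams(8) feeds NumTeams, thread_limit(128) feeds NumThreads,
    // nowait sets HasNoWait, and the loop's trip count reaches NumIterations
    // through the SizeEmitter callback.
    #pragma omp target teams distribute parallel for \
        num_teams(8) thread_limit(128) nowait map(tofrom : A[0 : N])
    for (int I = 0; I < N; ++I)
      A[I] = 2 * A[I];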
+static void
+emitTargetCallElse(CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
+                   const OMPExecutableDirective &D,
+                   llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
+                   bool RequiresOuterTask, const CapturedStmt &CS,
+                   bool OffloadingMandatory, CodeGenFunction &CGF) {
+
+  // Notify that the host version must be executed.
+  auto &&ElseGen =
+      [&OMPRuntime, OutlinedFn, &D, &CapturedVars, RequiresOuterTask, &CS,
+       OffloadingMandatory](CodeGenFunction &CGF, PrePostActionTy &) {
+        emitTargetCallFallback(OMPRuntime, OutlinedFn, D, CapturedVars,
+                               RequiresOuterTask, CS, OffloadingMandatory, CGF);
+      };
+
+  if (RequiresOuterTask) {
+    CodeGenFunction::OMPTargetDataInfo InputInfo;
+    CGF.EmitOMPTargetTaskBasedDirective(D, ElseGen, InputInfo);
+  } else {
+    OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ElseGen);
+  }
+}
+
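[Reviewer note, not part of the patch] In emitTargetCall below, the then/else generators are wired to the 'if' clause of the directive: when the clause evaluates to false (or no device image is available), only the host fallback runs. A small sketch of the source-level behavior (Cond and X are hypothetical):

    // Cond == true:  offload; on launch failure fall back to the host body.
    // Cond == false: skip the launch entirely and run the host body.
    #pragma omp target if(Cond) map(tofrom : X)
    X += 1;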
 void CGOpenMPRuntime::emitTargetCall(
     CodeGenFunction &CGF, const OMPExecutableDirective &D,
     llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond,
@@ -9619,263 +9860,24 @@
   CodeGenFunction::OMPTargetDataInfo InputInfo;
   llvm::Value *MapTypesArray = nullptr;
   llvm::Value *MapNamesArray = nullptr;
-  // Generate code for the host fallback function.
-  auto &&FallbackGen = [this, OutlinedFn, &D, &CapturedVars, RequiresOuterTask,
-                        &CS, OffloadingMandatory](CodeGenFunction &CGF) {
-    if (OffloadingMandatory) {
-      CGF.Builder.CreateUnreachable();
-    } else {
-      if (RequiresOuterTask) {
-        CapturedVars.clear();
-        CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
-      }
-      emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars);
-    }
-  };
-  // Fill up the pointer arrays and transfer execution to the device.
-  auto &&ThenGen = [this, Device, OutlinedFnID, &D, &InputInfo, &MapTypesArray,
-                    &MapNamesArray, SizeEmitter,
-                    FallbackGen](CodeGenFunction &CGF, PrePostActionTy &) {
-    if (Device.getInt() == OMPC_DEVICE_ancestor) {
-      // Reverse offloading is not supported, so just execute on the host.
-      FallbackGen(CGF);
-      return;
-    }
-
-    // On top of the arrays that were filled up, the target offloading call
-    // takes as arguments the device id as well as the host pointer. The host
-    // pointer is used by the runtime library to identify the current target
-    // region, so it only has to be unique and not necessarily point to
-    // anything. It could be the pointer to the outlined function that
-    // implements the target region, but we aren't using that so that the
-    // compiler doesn't need to keep that, and could therefore inline the host
-    // function if proven worthwhile during optimization.
-
-    // From this point on, we need to have an ID of the target region defined.
-    assert(OutlinedFnID && "Invalid outlined function ID!");
-    (void)OutlinedFnID;
-
-    // Emit device ID if any.
-    llvm::Value *DeviceID;
-    if (Device.getPointer()) {
-      assert((Device.getInt() == OMPC_DEVICE_unknown ||
-              Device.getInt() == OMPC_DEVICE_device_num) &&
-             "Expected device_num modifier.");
-      llvm::Value *DevVal = CGF.EmitScalarExpr(Device.getPointer());
-      DeviceID =
-          CGF.Builder.CreateIntCast(DevVal, CGF.Int64Ty, /*isSigned=*/true);
-    } else {
-      DeviceID = CGF.Builder.getInt64(OMP_DEVICEID_UNDEF);
-    }
-
-    // Emit the number of elements in the offloading arrays.
-    llvm::Value *PointerNum =
-        CGF.Builder.getInt32(InputInfo.NumberOfTargetItems);
-
-    // Return value of the runtime offloading call.
-    llvm::Value *Return;
-
-    llvm::Value *NumTeams = emitNumTeamsForTargetDirective(CGF, D);
-    llvm::Value *NumThreads = emitNumThreadsForTargetDirective(CGF, D);
-
-    // Source location for the ident struct
-    llvm::Value *RTLoc = emitUpdateLocation(CGF, D.getBeginLoc());
-
-    // Get tripcount for the target loop-based directive.
-    llvm::Value *NumIterations =
-        emitTargetNumIterationsCall(CGF, D, SizeEmitter);
-
-    llvm::Value *DynCGroupMem = CGF.Builder.getInt32(0);
-    if (auto *DynMemClause = D.getSingleClause<OMPXDynCGroupMemClause>()) {
-      CodeGenFunction::RunCleanupsScope DynCGroupMemScope(CGF);
-      llvm::Value *DynCGroupMemVal = CGF.EmitScalarExpr(
-          DynMemClause->getSize(), /*IgnoreResultAssign=*/true);
-      DynCGroupMem = CGF.Builder.CreateIntCast(DynCGroupMemVal, CGF.Int32Ty,
-                                               /*isSigned=*/false);
-    }
-
-    llvm::Value *ZeroArray =
-        llvm::Constant::getNullValue(llvm::ArrayType::get(CGF.CGM.Int32Ty, 3));
-
-    bool HasNoWait = D.hasClausesOfKind<OMPNowaitClause>();
-    llvm::Value *Flags = CGF.Builder.getInt64(HasNoWait);
-
-    llvm::Value *NumTeams3D =
-        CGF.Builder.CreateInsertValue(ZeroArray, NumTeams, {0});
-    llvm::Value *NumThreads3D =
-        CGF.Builder.CreateInsertValue(ZeroArray, NumThreads, {0});
-
-    // Arguments for the target kernel.
-    SmallVector<llvm::Value *> KernelArgs{
-        CGF.Builder.getInt32(/* Version */ 2),
-        PointerNum,
-        InputInfo.BasePointersArray.getPointer(),
-        InputInfo.PointersArray.getPointer(),
-        InputInfo.SizesArray.getPointer(),
-        MapTypesArray,
-        MapNamesArray,
-        InputInfo.MappersArray.getPointer(),
-        NumIterations,
-        Flags,
-        NumTeams3D,
-        NumThreads3D,
-        DynCGroupMem,
-    };
-
-    llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
-        CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator());
-
-    // The target region is an outlined function launched by the runtime
-    // via calls to __tgt_target_kernel().
-    //
-    // Note that on the host and CPU targets, the runtime implementation of
-    // these calls simply call the outlined function without forking threads.
-    // The outlined functions themselves have runtime calls to
-    // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
-    // the compiler in emitTeamsCall() and emitParallelCall().
-    //
-    // In contrast, on the NVPTX target, the implementation of
-    // __tgt_target_teams() launches a GPU kernel with the requested number
-    // of teams and threads so no additional calls to the runtime are required.
-    // Check the error code and execute the host version if required.
-    CGF.Builder.restoreIP(OMPBuilder.emitTargetKernel(
-        CGF.Builder, AllocaIP, Return, RTLoc, DeviceID, NumTeams, NumThreads,
-        OutlinedFnID, KernelArgs));
-
-    llvm::BasicBlock *OffloadFailedBlock =
-        CGF.createBasicBlock("omp_offload.failed");
-    llvm::BasicBlock *OffloadContBlock =
-        CGF.createBasicBlock("omp_offload.cont");
-    llvm::Value *Failed = CGF.Builder.CreateIsNotNull(Return);
-    CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
-
-    CGF.EmitBlock(OffloadFailedBlock);
-    FallbackGen(CGF);
-
-    CGF.EmitBranch(OffloadContBlock);
-
-    CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true);
-  };
-
-  // Notify that the host version must be executed.
-  auto &&ElseGen = [FallbackGen](CodeGenFunction &CGF, PrePostActionTy &) {
-    FallbackGen(CGF);
-  };
-
-  auto &&TargetThenGen = [this, &ThenGen, &D, &InputInfo, &MapTypesArray,
-                          &MapNamesArray, &CapturedVars, RequiresOuterTask,
-                          &CS](CodeGenFunction &CGF, PrePostActionTy &) {
-    // Fill up the arrays with all the captured variables.
-    MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
-
-    // Get mappable expression information.
-    MappableExprsHandler MEHandler(D, CGF);
-    llvm::DenseMap<llvm::Value *, llvm::Value *> LambdaPointers;
-    llvm::DenseSet<CanonicalDeclPtr<const Decl>> MappedVarSet;
-
-    auto RI = CS.getCapturedRecordDecl()->field_begin();
-    auto *CV = CapturedVars.begin();
-    for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
-                                              CE = CS.capture_end();
-         CI != CE; ++CI, ++RI, ++CV) {
-      MappableExprsHandler::MapCombinedInfoTy CurInfo;
-      MappableExprsHandler::StructRangeInfoTy PartialStruct;
-
-      // VLA sizes are passed to the outlined region by copy and do not have map
-      // information associated.
-      if (CI->capturesVariableArrayType()) {
-        CurInfo.Exprs.push_back(nullptr);
-        CurInfo.BasePointers.push_back(*CV);
-        CurInfo.DevicePtrDecls.push_back(nullptr);
-        CurInfo.Pointers.push_back(*CV);
-        CurInfo.Sizes.push_back(CGF.Builder.CreateIntCast(
-            CGF.getTypeSize(RI->getType()), CGF.Int64Ty, /*isSigned=*/true));
-        // Copy to the device as an argument. No need to retrieve it.
-        CurInfo.Types.push_back(
-            OpenMPOffloadMappingFlags::OMP_MAP_LITERAL |
-            OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM |
-            OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT);
-        CurInfo.Mappers.push_back(nullptr);
-      } else {
-        // If we have any information in the map clause, we use it, otherwise we
-        // just do a default mapping.
-        MEHandler.generateInfoForCapture(CI, *CV, CurInfo, PartialStruct);
-        if (!CI->capturesThis())
-          MappedVarSet.insert(CI->getCapturedVar());
-        else
-          MappedVarSet.insert(nullptr);
-        if (CurInfo.BasePointers.empty() && !PartialStruct.Base.isValid())
-          MEHandler.generateDefaultMapInfo(*CI, **RI, *CV, CurInfo);
-        // Generate correct mapping for variables captured by reference in
-        // lambdas.
-        if (CI->capturesVariable())
-          MEHandler.generateInfoForLambdaCaptures(CI->getCapturedVar(), *CV,
-                                                  CurInfo, LambdaPointers);
-      }
-      // We expect to have at least an element of information for this capture.
-      assert((!CurInfo.BasePointers.empty() || PartialStruct.Base.isValid()) &&
-             "Non-existing map pointer for capture!");
-      assert(CurInfo.BasePointers.size() == CurInfo.Pointers.size() &&
-             CurInfo.BasePointers.size() == CurInfo.Sizes.size() &&
-             CurInfo.BasePointers.size() == CurInfo.Types.size() &&
-             CurInfo.BasePointers.size() == CurInfo.Mappers.size() &&
-             "Inconsistent map information sizes!");
-
-      // If there is an entry in PartialStruct it means we have a struct with
-      // individual members mapped. Emit an extra combined entry.
-      if (PartialStruct.Base.isValid()) {
-        CombinedInfo.append(PartialStruct.PreliminaryMapData);
-        MEHandler.emitCombinedEntry(
-            CombinedInfo, CurInfo.Types, PartialStruct, CI->capturesThis(),
-            nullptr, !PartialStruct.PreliminaryMapData.BasePointers.empty());
-      }
-
-      // We need to append the results of this capture to what we already have.
-      CombinedInfo.append(CurInfo);
-    }
-    // Adjust MEMBER_OF flags for the lambdas captures.
-    MEHandler.adjustMemberOfForLambdaCaptures(
-        LambdaPointers, CombinedInfo.BasePointers, CombinedInfo.Pointers,
-        CombinedInfo.Types);
-    // Map any list items in a map clause that were not captures because they
-    // weren't referenced within the construct.
-    MEHandler.generateAllInfo(CombinedInfo, MappedVarSet);
-
-    CGOpenMPRuntime::TargetDataInfo Info;
-    // Fill up the arrays and create the arguments.
-    emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder);
-    bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
-                     llvm::codegenoptions::NoDebugInfo;
-    OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
-                                            EmitDebug,
-                                            /*ForEndCall=*/false);
-
-    InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
-    InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
-                                          CGF.VoidPtrTy, CGM.getPointerAlign());
-    InputInfo.PointersArray = Address(Info.RTArgs.PointersArray, CGF.VoidPtrTy,
-                                      CGM.getPointerAlign());
-    InputInfo.SizesArray =
-        Address(Info.RTArgs.SizesArray, CGF.Int64Ty, CGM.getPointerAlign());
-    InputInfo.MappersArray =
-        Address(Info.RTArgs.MappersArray, CGF.VoidPtrTy, CGM.getPointerAlign());
-    MapTypesArray = Info.RTArgs.MapTypesArray;
-    MapNamesArray = Info.RTArgs.MapNamesArray;
-    if (RequiresOuterTask)
-      CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
-    else
-      emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
+  auto &&TargetThenGen = [this, OutlinedFn, &D, &CapturedVars,
+                          RequiresOuterTask, &CS, OffloadingMandatory, Device,
+                          OutlinedFnID, &InputInfo, &MapTypesArray,
+                          &MapNamesArray, SizeEmitter](CodeGenFunction &CGF,
+                                                       PrePostActionTy &) {
+    emitTargetCallKernelLaunch(this, OutlinedFn, D, CapturedVars,
+                               RequiresOuterTask, CS, OffloadingMandatory,
+                               Device, OutlinedFnID, InputInfo, MapTypesArray,
+                               MapNamesArray, SizeEmitter, CGF, CGM);
   };
 
-  auto &&TargetElseGen = [this, &ElseGen, &D, RequiresOuterTask](
-                             CodeGenFunction &CGF, PrePostActionTy &) {
-    if (RequiresOuterTask) {
-      CodeGenFunction::OMPTargetDataInfo InputInfo;
-      CGF.EmitOMPTargetTaskBasedDirective(D, ElseGen, InputInfo);
-    } else {
-      emitInlinedDirective(CGF, D.getDirectiveKind(), ElseGen);
-    }
-  };
+  auto &&TargetElseGen =
+      [this, OutlinedFn, &D, &CapturedVars, RequiresOuterTask, &CS,
+       OffloadingMandatory](CodeGenFunction &CGF, PrePostActionTy &) {
+        emitTargetCallElse(this, OutlinedFn, D, CapturedVars, RequiresOuterTask,
+                           CS, OffloadingMandatory, CGF);
+      };
 
   // If we have a target function ID it means that we need to support
   // offloading, otherwise, just execute on the host. We need to execute on host
Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -405,6 +405,35 @@
   OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
 };
 
+struct TargetKernelArgs {
+  unsigned NumTargetItems;
+  Value *BasePointersArray;
+  Value *PointersArray;
+  Value *SizesArray;
+  Value *MapTypesArray;
+  Value *MapNamesArray;
+  Value *MappersArray;
+  Value *NumIterations;
+  Value *NumTeams;
+  Value *NumThreads;
+  Value *DynCGGroupMem;
+  bool HasNoWait;
+
+  SmallVector<Value *> getKernelArgsVector(IRBuilderBase &Builder);
+
+  TargetKernelArgs(unsigned NumTargetItems, Value *BasePointersArray,
+                   Value *PointersArray, Value *SizesArray,
+                   Value *MapTypesArray, Value *MapNamesArray,
+                   Value *MappersArray, Value *NumIterations, Value *NumTeams,
+                   Value *NumThreads, Value *DynCGGroupMem, bool HasNoWait)
+      : NumTargetItems(NumTargetItems), BasePointersArray(BasePointersArray),
+        PointersArray(PointersArray), SizesArray(SizesArray),
+        MapTypesArray(MapTypesArray), MapNamesArray(MapNamesArray),
+        MappersArray(MappersArray), NumIterations(NumIterations),
+        NumTeams(NumTeams), NumThreads(NumThreads),
+        DynCGGroupMem(DynCGGroupMem), HasNoWait(HasNoWait) {}
+};
+
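[Reviewer note, not part of the patch] A hedged usage sketch of the new struct, assuming a frontend that has already materialized the offloading arrays (every name below is illustrative):

    // Bundle the twelve launch parameters once; emitKernelLaunch (declared
    // further down) expands them into the 13-element __tgt_target_kernel
    // argument vector via getKernelArgsVector.
    llvm::TargetKernelArgs Args(NumItems, BasePtrs, Ptrs, Sizes, MapTypes,
                                MapNames, Mappers, NumIterations, NumTeams,
                                NumThreads, DynCGGroupMem, /*HasNoWait=*/false);
    Builder.restoreIP(OMPBuilder.emitKernelLaunch(
        Builder.saveIP(), OutlinedFn, OutlinedFnID, FallbackCB, Args, DeviceID,
        RTLoc, AllocaIP));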
 /// An interface to create LLVM-IR for OpenMP directives.
 ///
 /// Each OpenMP directive has a corresponding public generator method.
@@ -1248,6 +1277,24 @@
                          Value *NumThreads, Value *HostPtr,
                          ArrayRef<Value *> KernelArgs);
 
+  using EmitFallbackCallbackTy = function_ref<InsertPointTy(InsertPointTy)>;
+
+  /// Generate a target region entry call and host fallback call.
+  ///
+  /// \param Loc The location at which the request originated and is fulfilled.
+  /// \param OutlinedFn The outlined kernel function.
+  /// \param OutlinedFnID The outlined function ID.
+  /// \param emitTargetCallFallbackCB Callback function to generate the host
+  /// fallback code.
+  /// \param Args Data structure holding information about the kernel arguments.
+  /// \param DeviceID Identifier for the device via the 'device' clause.
+  /// \param RTLoc Source location identifier.
+  /// \param AllocaIP The insertion point to be used for alloca instructions.
+  InsertPointTy emitKernelLaunch(
+      const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
+      EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
+      Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
+
   /// Generate a barrier runtime call.
   ///
   /// \param Loc The location at which the request originated and is fulfilled.
Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===================================================================
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -328,6 +328,24 @@
   return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
 }
 
+SmallVector<Value *>
+TargetKernelArgs::getKernelArgsVector(IRBuilderBase &Builder) {
+  Value *Version = Builder.getInt32(/* Version */ 2);
+  Value *PointerNum = Builder.getInt32(NumTargetItems);
+  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
+  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
+  Value *Flags = Builder.getInt64(HasNoWait);
+
+  Value *NumTeams3D = Builder.CreateInsertValue(ZeroArray, NumTeams, {0});
+  Value *NumThreads3D = Builder.CreateInsertValue(ZeroArray, NumThreads, {0});
+
+  return SmallVector<Value *>{Version,       PointerNum,   BasePointersArray,
+                              PointersArray, SizesArray,   MapTypesArray,
+                              MapNamesArray, MappersArray, NumIterations,
+                              Flags,         NumTeams3D,   NumThreads3D,
+                              DynCGGroupMem};
+}
+
 void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
   LLVMContext &Ctx = Fn.getContext();
   Triple T(M.getTargetTriple());
@@ -842,6 +860,103 @@
   return Builder.saveIP();
 }
 
+static void EmitBranch(IRBuilderBase &Builder, BasicBlock *Target) {
+  // Emit a branch from the current block to the target one if this
+  // was a real block. If this was just a fall-through block after a
+  // terminator, don't emit it.
+  BasicBlock *CurBB = Builder.GetInsertBlock();
+
+  if (!CurBB || CurBB->getTerminator()) {
+    // If there is no insert point or the previous block is already
+    // terminated, don't touch it.
+  } else {
+    // Otherwise, create a fall-through branch.
+    Builder.CreateBr(Target);
+  }
+
+  Builder.ClearInsertionPoint();
+}
+
+static void EmitBlock(IRBuilderBase &Builder, BasicBlock *BB, Function *CurFn,
+                      bool IsFinished = false) {
+  BasicBlock *CurBB = Builder.GetInsertBlock();
+  // Fall out of the current block (if necessary).
+  EmitBranch(Builder, BB);
+
+  if (IsFinished && BB->use_empty()) {
+    delete BB;
+    return;
+  }
+
+  // Place the block after the current block, if possible, or else at
+  // the end of the function.
+  if (CurBB && CurBB->getParent())
+    CurFn->insert(std::next(CurBB->getIterator()), BB);
+  else
+    CurFn->insert(CurFn->end(), BB);
+  Builder.SetInsertPoint(BB);
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
+    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
+    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
+    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
+
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+
+  Builder.restoreIP(Loc.IP);
+  // On top of the arrays that were filled up, the target offloading call
+  // takes as arguments the device id as well as the host pointer. The host
+  // pointer is used by the runtime library to identify the current target
+  // region, so it only has to be unique and not necessarily point to
+  // anything. It could be the pointer to the outlined function that
+  // implements the target region, but we aren't using that so that the
+  // compiler doesn't need to keep that, and could therefore inline the host
+  // function if proven worthwhile during optimization.
+
+  // From this point on, we need to have an ID of the target region defined.
+  assert(OutlinedFnID && "Invalid outlined function ID!");
+  (void)OutlinedFnID;
+
+  // Return value of the runtime offloading call.
+  Value *Return;
+
+  // Arguments for the target kernel.
+  SmallVector<Value *> ArgsVector(Args.getKernelArgsVector(Builder));
+
+  // The target region is an outlined function launched by the runtime
+  // via calls to __tgt_target_kernel().
+  //
+  // Note that on the host and CPU targets, the runtime implementation of
+  // these calls simply call the outlined function without forking threads.
+  // The outlined functions themselves have runtime calls to
+  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
+  // the compiler in emitTeamsCall() and emitParallelCall().
+  //
+  // In contrast, on the NVPTX target, the implementation of
+  // __tgt_target_teams() launches a GPU kernel with the requested number
+  // of teams and threads so no additional calls to the runtime are required.
+  // Check the error code and execute the host version if required.
+  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
+                                     Args.NumTeams, Args.NumThreads,
+                                     OutlinedFnID, ArgsVector));
+
+  BasicBlock *OffloadFailedBlock =
+      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
+  BasicBlock *OffloadContBlock =
+      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
+  Value *Failed = Builder.CreateIsNotNull(Return);
+  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
+
+  auto CurFn = Builder.GetInsertBlock()->getParent();
+  EmitBlock(Builder, OffloadFailedBlock, CurFn);
+  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
+  EmitBranch(Builder, OffloadContBlock);
+  EmitBlock(Builder, OffloadContBlock, CurFn, /*IsFinished=*/true);
+  return Builder.saveIP();
+}
+
 void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                                omp::Directive CanceledDirective,
                                                FinalizeCallbackTy ExitCB) {
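[Reviewer note, not part of the patch] The fallback callback is what lets each frontend decide what runs on the omp_offload.failed path. A minimal sketch, assuming a host-side llvm::Function *HostFn of type void() that the caller wants invoked when the launch fails (illustrative only):

    auto FallbackCB = [&](llvm::OpenMPIRBuilder::InsertPointTy IP)
        -> llvm::OpenMPIRBuilder::InsertPointTy {
      llvm::IRBuilder<> B(IP.getBlock(), IP.getPoint());
      B.CreateCall(HostFn); // host version of the target region
      return {B.GetInsertBlock(), B.GetInsertPoint()};
    };

emitKernelLaunch branches on the i32 returned by __tgt_target_kernel: zero falls through to omp_offload.cont, while a non-zero value executes the callback's code in omp_offload.failed before rejoining.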