Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -286,6 +286,7 @@
   OMP_STRUCT_TYPE(VarName, "struct." #Name, __VA_ARGS__)
 
 __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr)
+__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, Int8Ptr)
 
 #undef __OMP_STRUCT_TYPE
 #undef OMP_STRUCT_TYPE
@@ -570,6 +571,9 @@
           VoidPtrPtr, Int64Ptr, Int64Ptr)
 __OMP_RTL(__tgt_target_data_begin_nowait, false, Void, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr)
+__OMP_RTL(__tgt_target_data_begin_issue, false, AsyncInfo, Int64, Int32, VoidPtrPtr,
+          VoidPtrPtr, Int64Ptr, Int64Ptr)
+__OMP_RTL(__tgt_target_data_begin_wait, false, Void, Int64, AsyncInfo)
 __OMP_RTL(__tgt_target_data_end, false, Void, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr)
 __OMP_RTL(__tgt_target_data_end_nowait, false, Void, Int64, Int32, VoidPtrPtr,
Index: llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
===================================================================
--- llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
+++ llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
@@ -175,20 +175,23 @@
     bool isFilled();
   };
 
-  CallBase *RuntimeCall; /// Call that involves a memotry transfer.
-  InformationCache &InfoCache;
+  CallInst *RuntimeCall; /// Call that involves a memory transfer.
+  OMPInformationCache &InfoCache;
 
   /// These help mapping the values in offload_baseptrs, offload_ptrs, and
   /// offload_sizes, respectively.
+  const unsigned BasePtrsArgNum = 2;
   std::unique_ptr<OffloadArray> BasePtrs = nullptr;
+  const unsigned PtrsArgNum = 3;
   std::unique_ptr<OffloadArray> Ptrs = nullptr;
+  const unsigned SizesArgNum = 4;
   std::unique_ptr<OffloadArray> Sizes = nullptr;
 
   /// Set of instructions that compose the argument setup for the call
   /// RuntimeCall.
   SetVector<Instruction *> Issue;
 
-  MemoryTransfer(CallBase *RuntimeCall, InformationCache &InfoCache) :
+  MemoryTransfer(CallInst *RuntimeCall, OMPInformationCache &InfoCache) :
       RuntimeCall{RuntimeCall}, InfoCache{InfoCache} {}
 
@@ -207,6 +210,11 @@
   ///   offload arrays.
   bool mayBeModifiedBy(Instruction *I);
 
+  /// Splits this memory transfer into its corresponding "issue" and "wait"
+  /// runtime calls. The "issue" is moved after \p After and the "wait" is
+  /// moved before \p Before.
+  bool split(Instruction *After, Instruction *Before);
+
 private:
   /// Gets the setup instructions for each of the values in \p OA. These
   /// instructions are stored into Issue.
@@ -218,6 +226,10 @@
 
   /// Returns true if \p I may modify one of the values in \p Values.
   bool mayModify(Instruction *I, SmallVectorImpl<Value *> &Values);
+
+  /// Removes all the instructions in Issue from their function and inserts
+  /// them after \p After.
+  void moveIssue(Instruction *After);
 };
 
 /// The slice of the module we are allowed to look at.
@@ -301,6 +313,10 @@
   /// moved. Returns nullptr if the movement is not possible, or not worth it.
   Instruction *canBeMovedUpwards(MemoryTransfer &MT);
 
+  /// Returns a pointer to the instruction where the "wait" of \p MT can be
+  /// moved. Returns nullptr if the movement is not possible, or not worth it.
+  Instruction *canBeMovedDownwards(MemoryTransfer &MT);
+
   static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                     bool GlobalOnly, bool &SingleChoice);
Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -44,6 +44,11 @@
 static cl::opt<bool>
     PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden);
 
+static cl::opt<bool> SplitMemoryTransfers(
+    "openmp-split-memtransfers",
+    cl::desc("Tries to hide the latency of host to device memory transfers"),
+    cl::Hidden, cl::init(false));
+
 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
           "Number of OpenMP runtime calls deduplicated");
 STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -253,13 +258,10 @@
   //   arrays, offload_baseptrs, offload_ptrs, offload_sizes.
   // Therefore:
   //   i8** %offload_baseptrs.
-  const unsigned BasePtrsArgNum = 2;
   Use *BasePtrsArg = RuntimeCall->arg_begin() + BasePtrsArgNum;
   //   i8** %offload_ptrs.
-  const unsigned PtrsArgNum = 3;
   Use *PtrsArg = RuntimeCall->arg_begin() + PtrsArgNum;
   //   i8** %offload_sizes.
-  const unsigned SizesArgNum = 4;
   Use *SizesArg = RuntimeCall->arg_begin() + SizesArgNum;
 
   const DataLayout &DL = InfoCache.getDL();
@@ -337,6 +339,9 @@
                << RuntimeCall->getCaller()->getName() << "\n");
     return false;
   }
+  auto *BasePtrsGEP =
+      cast<GetElementPtrInst>(RuntimeCall->getArgOperand(BasePtrsArgNum));
+  Issue.insert(BasePtrsGEP);
 
   Success = getSetupInstructions(Ptrs);
   if (!Success) {
@@ -346,6 +351,9 @@
                << RuntimeCall->getCaller()->getName() << "\n");
     return false;
   }
+  auto *PtrsGEP =
+      cast<GetElementPtrInst>(RuntimeCall->getArgOperand(PtrsArgNum));
+  Issue.insert(PtrsGEP);
 
   if (Sizes) {
     Success = getSetupInstructions(Sizes);
@@ -356,6 +364,9 @@
                  << RuntimeCall->getCaller()->getName() << "\n");
       return false;
     }
+    auto *SizesGEP =
+        cast<GetElementPtrInst>(RuntimeCall->getArgOperand(SizesArgNum));
+    Issue.insert(SizesGEP);
   }
 
   return true;
@@ -495,6 +506,65 @@
   return true;
 }
 
+bool MemoryTransfer::split(Instruction *After, Instruction *Before) {
+  assert((After || Before) &&
+         "Must have a place to move the split runtime call");
+
+  auto *M = RuntimeCall->getModule();
+  auto &IRBuilder = InfoCache.OMPBuilder;
+  // Add the "issue" runtime call declaration:
+  //   declare %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64,
+  //       i32, i8**, i8**, i64*, i64*)
+  FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+      *M, OMPRTL___tgt_target_data_begin_issue);
+
+  // Replace the RuntimeCall call site with its asynchronous version.
+  std::vector<Value *> Args;
+  Args.reserve(RuntimeCall->getNumArgOperands());
+  for (auto &Arg : RuntimeCall->args())
+    Args.push_back(Arg.get());
+
+  CallInst *IssueCallsite = CallInst::Create(
+      IssueDecl, ArrayRef<Value *>(Args), "handle", RuntimeCall);
+  RuntimeCall->removeFromParent();
+  RuntimeCall->deleteValue();
+  Issue.insert(IssueCallsite);
+
+  // Add the "wait" runtime call declaration:
+  //   declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
+  FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+      *M, OMPRTL___tgt_target_data_begin_wait);
+
+  // Add the "wait" call site.
+  const unsigned WaitNumParams = 2;
+  Value *WaitParams[] = {
+      IssueCallsite->getArgOperand(0), // device_id.
+      IssueCallsite                    // Returned handle.
+  };
+  CallInst *WaitCallsite = CallInst::Create(
+      WaitDecl, ArrayRef<Value *>(WaitParams, WaitNumParams), /*NameStr=*/"",
+      /*InsertBefore=*/(Instruction *)nullptr);
+
+  // Move the "wait".
+  if (!Before)
+    WaitCallsite->insertAfter(IssueCallsite);
+  else
+    WaitCallsite->insertBefore(Before);
+
+  if (After)
+    moveIssue(After);
+
+  return true;
+}
+
+void MemoryTransfer::moveIssue(Instruction *After) {
+  // Insert each instruction right after the previous one, preserving the
+  // original relative order of the setup instructions.
+  for (auto *I : Issue) {
+    I->removeFromParent();
+    I->insertAfter(After);
+    After = I;
+  }
+}
+
 std::unique_ptr<OffloadArray> OffloadArray::initialize(
     AllocaInst &Array, Instruction &Before, InformationCache &InfoCache) {
   if (!Array.getAllocatedType()->isArrayTy()) {
@@ -802,7 +872,8 @@
   Changed |= runAttributor();
   Changed |= deduplicateRuntimeCalls();
   Changed |= deleteParallelRegions();
-  Changed |= hideMemTransfersLatency();
+  if (SplitMemoryTransfers)
+    Changed |= hideMemTransfersLatency();
 
   return Changed;
 }
@@ -945,10 +1016,9 @@
       return false;
     }
 
-    if (auto *I = canBeMovedUpwards(MT)) {
-      // TODO: Split call and move "issue" below I.
-    }
-    return false;
+    auto *After = canBeMovedUpwards(MT);
+    auto *Before = canBeMovedDownwards(MT);
+    return (After || Before) && MT.split(After, Before);
   };
 
   RFI.foreachUse(SplitDataTransfer);
@@ -958,7 +1028,7 @@
 Instruction *OpenMPOpt::canBeMovedUpwards(MemoryTransfer &MT) {
   assert(MT.Issue.size() > 0 &&
          "There's not set of instructions to be moved!");
-  CallBase *RC = MT.RuntimeCall;
+  CallInst *RC = MT.RuntimeCall;
 
   auto *MSSAResult = OMPInfoCache.getAnalysisResultForFunction<MemorySSAAnalysis>(
       *RC->getCaller());
@@ -978,8 +1048,13 @@
       continue;
 
     auto *MemInst = (cast<MemoryUseOrDef>(MemAccess))->getMemoryInst();
-    if (MT.mayBeModifiedBy(MemInst))
-      return MemInst;
+    if (MT.mayBeModifiedBy(MemInst)) {
+      // Moving is only profitable if MemInst is not already the instruction
+      // immediately before the "issue".
+      if (!MT.Issue.count(MemInst->getNextNode()))
+        return MemInst;
+
+      return nullptr;
+    }
 
     MemAccess = MSSAWalker->getClobberingMemoryAccess(MemAccess);
   }
@@ -987,6 +1062,34 @@
 
   return nullptr;
 }
 
+Instruction *OpenMPOpt::canBeMovedDownwards(MemoryTransfer &MT) {
+  assert(MT.Issue.size() > 0 &&
+         "There's no set of instructions to be moved!");
+
+  // FIXME: This traverses only the BasicBlock where MT is. Make it traverse
+  //        the CFG.
+  GlobalValue *TgtTargetDecl = M.getNamedValue("__tgt_target");
+  GlobalValue *TgtTargetTeamsDecl = M.getNamedValue("__tgt_target_teams");
+  GlobalValue *TgtTargetDataEndDecl = M.getNamedValue("__tgt_target_data_end");
+  CallInst *RC = MT.RuntimeCall;
+  auto *I = RC->getNextNode();
+  while (I) {
+    if (auto *C = dyn_cast<CallBase>(I)) {
+      auto *Callee = C->getCalledFunction();
+      if (Callee == TgtTargetDecl)
+        return I;
+      if (Callee == TgtTargetTeamsDecl)
+        return I;
+      if (Callee == TgtTargetDataEndDecl)
+        return I;
+    }
+
+    I = I->getNextNode();
+  }
+
+  // Reached the end of the block: place the "wait" right before the terminator.
+  return RC->getParent()->getTerminator();
+}
+
 Value *OpenMPOpt::combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                       bool GlobalOnly, bool &SingleChoice) {
   if (CurrentIdent == NextIdent)
Index: llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
===================================================================
--- llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
+++ llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
@@ -1,9 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
-; RUN: opt -S -passes=openmpopt < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-split-memtransfers < %s | FileCheck %s
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
-; FIXME: This struct should be generated after splitting at least one of the runtime calls.
-; %struct.__tgt_async_info = type { i8* }
+; CHECK: %struct.__tgt_async_info = type { i8* }
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
 %struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
@@ -50,14 +49,18 @@
 ; CHECK-NEXT:    %rem = srem i32 %call, 777
 ; CHECK-NEXT:    %conv = sitofp i32 %rem to double
 ; CHECK-NEXT:    store double %conv, double* %a, align 8
+
 ; CHECK-NEXT:    %call1 = call i32 @rand()
+
 ; CHECK-NEXT:    %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
 ; CHECK-NEXT:    %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
 ; CHECK-NEXT:    store double* %a, double** %2, align 8
 ; CHECK-NEXT:    %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
 ; CHECK-NEXT:    %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
 ; CHECK-NEXT:    store double* %a, double** %4, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
+
 ; CHECK-NEXT:    %5 = bitcast double* %a to i64*
 ; CHECK-NEXT:    %6 = load i64, i64* %5, align 8
 ; CHECK-NEXT:    %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
@@ -66,7 +69,10 @@
 ; CHECK-NEXT:    %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0
 ; CHECK-NEXT:    %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
 ; CHECK-NEXT:    store i64 %6, i64* %10, align 8
-; CHECK-NEXT:    %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
+
+; CHECK-NEXT:    call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+
+; CHECK-NEXT:    %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nocapture %7, i8** nocapture %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
 ; CHECK-NEXT:    %12 = icmp eq i32 %11, 0
 ; CHECK-NEXT:    br i1 %12, label %omp_offload.cont, label %omp_offload.failed
 ; CHECK:       omp_offload.failed:
@@ -86,32 +92,15 @@
   %.offload_baseptrs4 = alloca [1 x i8*], align 8
   %.offload_ptrs5 = alloca [1 x i8*], align 8
 
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   %0 = bitcast double* %a to i8*
   %call = call i32 @rand()
   %rem = srem i32 %call, 777
   %conv = sitofp i32 %rem to double
   store double %conv, double* %a, align 8
 
-  ; FIXME: The "isue" should be moved here.
+  ; FIXME: The call to @__tgt_target_data_begin_issue(...) should be moved here.
   %call1 = call i32 @rand()
 
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. The call should be replaced to something
-  ;        like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
-  ; Wait - this is moved downwards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
   %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
   store double* %a, double** %2, align 8
@@ -129,8 +118,7 @@
   %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
   store i64 %6, i64* %10, align 8
 
-  ; FIXME: The "wait" should be moved here.
-  %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
+  %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nocapture %7, i8** nocapture %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
   %12 = icmp eq i32 %11, 0
   br i1 %12, label %omp_offload.cont, label %omp_offload.failed
@@ -148,6 +136,10 @@
 }
 
 define internal void @heavyComputation1FallBack(i64 %a) {
+; CHECK-LABEL: define {{[^@]+}}@heavyComputation1FallBack(i64 %a)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ; Fallback for offloading function heavyComputation1.
   ret void
@@ -179,7 +171,9 @@
 ; CHECK-NEXT:    %.offload_baseptrs2 = alloca [2 x i8*], align 8
 ; CHECK-NEXT:    %.offload_ptrs3 = alloca [2 x i8*], align 8
 ; CHECK-NEXT:    store i32 %size, i32* %size.addr, align 4
+
 ; CHECK-NEXT:    %call = call i32 @rand()
+
 ; CHECK-NEXT:    %conv = zext i32 %size to i64
 ; CHECK-NEXT:    %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -198,7 +192,8 @@
 ; CHECK-NEXT:    store i32* %size.addr, i32** %9, align 8
 ; CHECK-NEXT:    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
 ; CHECK-NEXT:    store i64 4, i64* %10, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+
 ; CHECK-NEXT:    %11 = load i32, i32* %size.addr, align 4
 ; CHECK-NEXT:    %size.casted = zext i32 %11 to i64
 ; CHECK-NEXT:    %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -213,6 +208,9 @@
 ; CHECK-NEXT:    %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
 ; CHECK-NEXT:    %19 = bitcast i8** %18 to double**
 ; CHECK-NEXT:    store double* %a, double** %19, align 8
+
+; CHECK-NEXT:    call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
 ; CHECK-NEXT:    %21 = icmp eq i32 %20, 0
 ; CHECK-NEXT:    br i1 %21, label %omp_offload.cont, label %omp_offload.failed
@@ -232,27 +230,9 @@
   %.offload_baseptrs2 = alloca [2 x i8*], align 8
   %.offload_ptrs3 = alloca [2 x i8*], align 8
 
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   store i32 %size, i32* %size.addr, align 4
   %call = call i32 @rand()
 
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ;        split into its "issue" and "wait" counterpars. Here though, the "issue"
-  ;        cannot be moved upwards because it's not guaranteed that rand()
-  ;        won't modify *a. Nevertheless, the "wait" can be moved downwards.
-  ;        The call should be replaced to something like ...
-  ; Issue - this can't be moved upwards, *a might have aliases.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -288,7 +268,6 @@
   %19 = bitcast i8** %18 to double**
   store double* %a, double** %19, align 8
 
-  ; FIXME: The "wait" should be moved here.
   %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
   %21 = icmp eq i32 %20, 0
   br i1 %21, label %omp_offload.cont, label %omp_offload.failed
@@ -305,6 +284,10 @@
 }
 
 define internal void @heavyComputation2FallBack(i64 %size, double* %a) {
+; CHECK-LABEL: define {{[^@]+}}@heavyComputation2FallBack(i64 %size, double* %a)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ; Fallback for offloading function heavyComputation2.
   ret void
@@ -336,7 +319,9 @@
 ; CHECK-NEXT:    %.offload_baseptrs2 = alloca [2 x i8*], align 8
 ; CHECK-NEXT:    %.offload_ptrs3 = alloca [2 x i8*], align 8
 ; CHECK-NEXT:    store i32 %size, i32* %size.addr, align 4
+
 ; CHECK-NEXT:    %call = call i32 @rand()
+
 ; CHECK-NEXT:    %conv = zext i32 %size to i64
 ; CHECK-NEXT:    %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -355,7 +340,8 @@
 ; CHECK-NEXT:    store i32* %size.addr, i32** %9, align 8
 ; CHECK-NEXT:    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
 ; CHECK-NEXT:    store i64 4, i64* %10, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+
 ; CHECK-NEXT:    %11 = load i32, i32* %size.addr, align 4
 ; CHECK-NEXT:    %size.casted = zext i32 %11 to i64
 ; CHECK-NEXT:    %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -370,6 +356,9 @@
 ; CHECK-NEXT:    %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
 ; CHECK-NEXT:    %19 = bitcast i8** %18 to double**
 ; CHECK-NEXT:    store double* %a, double** %19, align 8
+
+; CHECK-NEXT:    call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
 ; CHECK-NEXT:    %21 = icmp eq i32 %20, 0
 ; CHECK-NEXT:    br i1 %21, label %omp_offload.cont, label %omp_offload.failed
@@ -389,28 +378,11 @@
   %.offload_baseptrs2 = alloca [2 x i8*], align 8
   %.offload_ptrs3 = alloca [2 x i8*], align 8
 
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   store i32 %size, i32* %size.addr, align 4
 
-  ; FIXME: The "issue" should be moved here.
+  ; FIXME: The call to @__tgt_target_data_begin_issue(...) should be moved here.
   %call = call i32 @rand()
 
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. The call should be replaced to something
-  ;        like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -446,7 +418,6 @@
   %19 = bitcast i8** %18 to double**
   store double* %a, double** %19, align 8
 
-  ; FIXME: The "wait" should be moved here.
   %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
   %21 = icmp eq i32 %20, 0
   br i1 %21, label %omp_offload.cont, label %omp_offload.failed
@@ -463,6 +434,10 @@
 }
 
 define internal void @heavyComputation3FallBack(i64 %size, double* %a) {
+; CHECK-LABEL: define {{[^@]+}}@heavyComputation3FallBack(i64 %size, double* %a)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ; Fallback for offloading function heavyComputation3.
   ret void
@@ -487,7 +462,9 @@
 ; CHECK-NEXT:    %.offload_baseptrs = alloca [1 x i8*], align 8
 ; CHECK-NEXT:    %.offload_ptrs = alloca [1 x i8*], align 8
 ; CHECK-NEXT:    %.offload_sizes = alloca [1 x i64], align 8
+
 ; CHECK-NEXT:    %call = call i32 @rand()
+
 ; CHECK-NEXT:    %conv = zext i32 %size to i64
 ; CHECK-NEXT:    %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT:    %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -498,8 +475,12 @@
 ; CHECK-NEXT:    store double* %a, double** %4, align 8
 ; CHECK-NEXT:    %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
 ; CHECK-NEXT:    store i64 %0, i64* %5, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
+
 ; CHECK-NEXT:    %rem = urem i32 %call, %size
+
+; CHECK-NEXT:    call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
 ; CHECK-NEXT:    ret i32 %rem
 ;
@@ -508,26 +489,9 @@
   %.offload_ptrs = alloca [1 x i8*], align 8
   %.offload_sizes = alloca [1 x i64], align 8
 
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
-  ; FIXME: The "issue" should be moved here.
+  ; FIXME: The call to @__tgt_target_data_begin_issue(...) should be moved here.
   %call = call i32 @rand()
 
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. The call should be replaced to something
-  ;        like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -542,7 +506,6 @@
 
   %rem = urem i32 %call, %size
 
-  ; FIXME: The "wait" should be moved here.
   call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
   ret i32 %rem
 }
@@ -555,7 +518,6 @@
 
 declare dso_local i32 @rand()
 
-; FIXME: These two function declarations must be generated after splitting the runtime function
-;        __tgt_target_data_begin.
-; declare dso_local i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8), i32, i8**, i8**, i64*, i64*)
-; declare dso_local void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info* dereferenceable(8))
+; CHECK: declare %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64, i32, i8**, i8**, i64*, i64*)
+; CHECK: declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
+
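For reference, here is an editorial sketch (not part of the patch itself) of the rewrite that the new runtime entry points enable. The call signatures match the __OMP_RTL declarations added above; the value names %baseptrs, %ptrs, %sizes, %maptypes and %unrelated are illustrative placeholders, not names produced by the pass:

  ; Before the pass: the host blocks in __tgt_target_data_begin until the
  ; copy to the device has finished, and only then runs the independent
  ; call to @rand().
  call void @__tgt_target_data_begin(i64 -1, i32 1, i8** %baseptrs, i8** %ptrs, i64* %sizes, i64* %maptypes)
  %unrelated = call i32 @rand()

  ; After the pass: the transfer is issued asynchronously and yields a
  ; handle, the independent host work overlaps with the copy, and the
  ; "wait" synchronizes only where the transferred data is first needed.
  %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 1, i8** %baseptrs, i8** %ptrs, i64* %sizes, i64* %maptypes)
  %unrelated = call i32 @rand()
  call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)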