diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -689,7 +689,12 @@ if (!RTCall) return false; - bool WasSplit = splitTargetDataBeginRTC(RTCall); + // TODO: Check if can be moved upwards. + bool WasSplit = false; + Instruction *WaitMovementPoint = canBeMovedDownwards(RTCall); + if (WaitMovementPoint) + WasSplit = splitTargetDataBeginRTC(RTCall, WaitMovementPoint); + Changed |= WasSplit; return WasSplit; }; @@ -698,8 +703,54 @@ return Changed; } + /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be + /// moved. Returns nullptr if the movement is not possible, or not worth it. + Instruction *canBeMovedDownwards(CallInst *RuntimeCall) { + // FIXME: This traverses only the BasicBlock where RuntimeCall is. + // Make it traverse the CFG. + + // Functions that may require the data transferred or may synchronize it. + auto *TargetTeams = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunctionPtr( + llvm::omp::RuntimeFunction::OMPRTL___tgt_target_teams_mapper); + auto *TargetDataEnd = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunctionPtr( + llvm::omp::RuntimeFunction::OMPRTL___tgt_target_data_end_mapper); + + Instruction *CurrentI = RuntimeCall; + bool IsWorthIt = false; + while ((CurrentI = CurrentI->getNextNode())) { + + // TODO: Once we detect the regions to be offloaded we should use the + // alias analysis manager to check if CurrentI may modify one of + // the offloaded regions. + if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) { + if (IsWorthIt) + return CurrentI; + return nullptr; + } + + if (auto *C = dyn_cast<CallInst>(CurrentI)) { + auto *Callee = C->getCalledFunction(); + if (Callee == TargetTeams || Callee == TargetDataEnd) { + if (IsWorthIt) + return CurrentI; + return nullptr; + } + } + + // FIXME: For now if we move it over anything without side effect + // is worth it. 
+ IsWorthIt = true; + } + + // Return the terminator of the BasicBlock. + return RuntimeCall->getParent()->getTerminator(); + } + /// Splits \p RuntimeCall into its "issue" and "wait" counterparts. - bool splitTargetDataBeginRTC(CallInst *RuntimeCall) { + bool splitTargetDataBeginRTC(CallInst *RuntimeCall, + Instruction *WaitMovementPoint) { + assert(WaitMovementPoint && "No place to move the split runtime call!"); + auto &IRBuilder = OMPInfoCache.OMPBuilder; // Add "issue" runtime call declaration: // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, @@ -726,8 +777,8 @@ IssueCallsite->getArgOperand(0), // device_id. IssueCallsite // returned handle. }; - CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", - IssueCallsite->getNextNode()); + CallInst::Create( + WaitDecl, WaitParams, /*NameStr=*/"", WaitMovementPoint); return true; } diff --git a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll --- a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll +++ b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll @@ -59,9 +59,11 @@ ; CHECK-NEXT: store double* %a, double** %4, align 8 ; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null) -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) ; CHECK-NEXT: %5 = bitcast double* %a to i64* + +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %6 = load i64, i64* %5, align 8 ; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0 ; CHECK-NEXT: %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64* @@ -184,8 +186,7 @@ ; CHECK-NEXT: %10 = 
getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 ; CHECK-NEXT: store i64 4, i64* %10, align 8 -; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) ; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4 ; CHECK-NEXT: %size.casted = zext i32 %11 to i64 @@ -325,8 +326,7 @@ ; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 ; CHECK-NEXT: store i64 4, i64* %10, align 8 -; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) ; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4 ; CHECK-NEXT: %size.casted = zext i32 %11 to i64 @@ -453,9 +453,11 @@ ; CHECK-NEXT: store i64 %0, i64* %5, align 8 ; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null) -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, 
%struct.__tgt_async_info %handle) ; CHECK-NEXT: %rem = urem i32 %call, %size + +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null) ; CHECK-NEXT: ret i32 %rem ;