Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -198,6 +198,7 @@ OMP_STRUCT_TYPE(VarName, "struct." #Name, __VA_ARGS__) __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr) +__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, Int8Ptr) #undef __OMP_STRUCT_TYPE #undef OMP_STRUCT_TYPE @@ -482,6 +483,9 @@ VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr) __OMP_RTL(__tgt_target_data_begin_nowait_mapper, false, Void, Int64, Int32, VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr) +__OMP_RTL(__tgt_target_data_begin_mapper_issue, false, AsyncInfo, Int64, Int32, + VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr) +__OMP_RTL(__tgt_target_data_begin_mapper_wait, false, Void, Int64, AsyncInfo) __OMP_RTL(__tgt_target_data_end_mapper, false, Void, Int64, Int32, VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr) __OMP_RTL(__tgt_target_data_end_nowait_mapper, false, Void, Int64, Int32, Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -42,6 +42,12 @@ static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden); +// Off by default; when set, the pass additionally runs hideMemTransfersLatency(). +static cl::opt<bool> SplitMemoryTransfers( + "openmp-split-memtransfers", + cl::desc("Tries to hide the latency of host to device memory transfers"), + cl::Hidden, cl::init(false)); + + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -508,6 +514,8 @@ Changed |= deduplicateRuntimeCalls(); Changed |= deleteParallelRegions(); + if (SplitMemoryTransfers) + Changed |= hideMemTransfersLatency(); return Changed; } @@ -666,6 +674,66 @@ return Changed; } + /// Tries to hide the latency of runtime calls that involve host to 
+ /// device memory transfers by splitting them into their "issue" and "wait" + /// versions. The "issue" starts the memory transfer asynchronously and + /// returns a handle; the "wait" waits on that handle for the memory transfer + /// to finish. + /// TODO: Hoist the "issue" upwards and sink the "wait" downwards as far as + /// possible; for now both calls are emitted back to back at the position of + /// the original call. + /// \returns true if at least one runtime call was split. + bool hideMemTransfersLatency() { + auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper]; + bool Changed = false; + auto SplitMemTransfers = [&](Use &U, Function &Decl) { + auto *RTCall = getCallIfRegularCall(U, &RFI); + if (!RTCall) + return false; + + // Report the outcome for this use only, not the accumulated flag. + // NOTE(review): foreachUse presumably forgets every use for which the + // callback returns true -- confirm against RuntimeFunctionInfo. + bool WasSplit = split(RTCall); + Changed |= WasSplit; + return WasSplit; + }; + RFI.foreachUse(SCC, SplitMemTransfers); + + return Changed; + } + + /// Replaces \p RuntimeCall with a call to its "issue" counterpart, + /// immediately followed by a call to the matching "wait", and erases the + /// original call. + /// \returns true, since the split is currently unconditional. + bool split(CallInst *RuntimeCall) { + auto &OMPBuilder = OMPInfoCache.OMPBuilder; + // Add "issue" runtime call declaration. + // declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue( + // i64, i32, i8**, i8**, i64*, i64*, i8**) + FunctionCallee IssueDecl = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_data_begin_mapper_issue); + + // Change RuntimeCall callsite for its asynchronous version, forwarding + // every argument unchanged. + SmallVector<Value *, 8> Args; + Args.reserve(RuntimeCall->getNumArgOperands()); + for (auto &Arg : RuntimeCall->args()) + Args.push_back(Arg.get()); + + CallInst *IssueCallsite = + CallInst::Create(IssueDecl, Args, "handle", RuntimeCall); + // Unlink and destroy the original call in one step. + RuntimeCall->eraseFromParent(); + + // Add "wait" runtime call declaration. + // declare void @__tgt_target_data_begin_mapper_wait( + // i64, %struct.__tgt_async_info) + FunctionCallee WaitDecl = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_data_begin_mapper_wait); + + // Add "wait" call site directly after the "issue". + Value *WaitParams[] = { + IssueCallsite->getArgOperand(0), // device_id. + IssueCallsite // returned handle. 
+ }; + CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", + IssueCallsite->getNextNode()); + + return true; + } + static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, bool GlobalOnly, bool &SingleChoice) { if (CurrentIdent == NextIdent) Index: llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll =================================================================== --- llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll +++ llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll @@ -1,25 +1,22 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature -; RUN: opt -S -passes=openmpopt < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature --scrub-attributes +; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-split-memtransfers < %s | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -; FIXME: This struct should be generated after splitting at least one of the runtime calls. 
-; %struct.__tgt_async_info = type { i8* } +; CHECK: %struct.__tgt_async_info = type { i8* } + %struct.ident_t = type { i32, i32, i32, i32, i8* } %struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] -@.__omp_offloading_heavyComputation1.region_id = weak constant i8 0 +@.__omp_offloading_heavyComputation1_region_id = weak constant i8 0 @.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8] @.offload_maptypes.2 = private unnamed_addr constant [1 x i64] [i64 800] - -@.__omp_offloading_heavyComputation2.region_id = weak constant i8 0 -@.offload_maptypes.3 = private unnamed_addr constant [2 x i64] [i64 35, i64 35] - -@.__omp_offloading_heavyComputation3.region_id = weak constant i8 0 -@.offload_sizes.2 = private unnamed_addr constant [2 x i64] [i64 4, i64 0] -@.offload_maptypes.4 = private unnamed_addr constant [2 x i64] [i64 800, i64 544] - -@.offload_maptypes.5 = private unnamed_addr constant [1 x i64] [i64 33] +@.__omp_offloading_heavyComputation2_region_id = weak constant i8 0 +@.offload_maptypes.7 = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.__omp_offloading_heavyComputation3_region_id = weak constant i8 0 +@.offload_sizes.9 = private unnamed_addr constant [2 x i64] [i64 4, i64 0] +@.offload_maptypes.10 = private unnamed_addr constant [2 x i64] [i64 800, i64 544] +@.offload_maptypes.11 = private unnamed_addr constant [1 x i64] [i64 33] ;double heavyComputation1() { ; double a = rand() % 777; @@ -46,18 +43,21 @@ ; CHECK-NEXT: %.offload_baseptrs4 = alloca [1 x i8*], align 8 ; CHECK-NEXT: %.offload_ptrs5 = alloca [1 x i8*], align 8 ; CHECK-NEXT: %0 = bitcast double* %a to i8* -; CHECK-NEXT: %call = call i32 @rand() +; CHECK-NEXT: %call = tail call i32 (...) 
@rand() ; CHECK-NEXT: %rem = srem i32 %call, 777 ; CHECK-NEXT: %conv = sitofp i32 %rem to double ; CHECK-NEXT: store double %conv, double* %a, align 8 -; CHECK-NEXT: %call1 = call i32 @rand() +; CHECK-NEXT: %call1 = tail call i32 (...) @rand() ; CHECK-NEXT: %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 ; CHECK-NEXT: %2 = bitcast [1 x i8*]* %.offload_baseptrs to double** ; CHECK-NEXT: store double* %a, double** %2, align 8 ; CHECK-NEXT: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 ; CHECK-NEXT: %4 = bitcast [1 x i8*]* %.offload_ptrs to double** ; CHECK-NEXT: store double* %a, double** %4, align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0)) + +; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %5 = bitcast double* %a to i64* ; CHECK-NEXT: %6 = load i64, i64* %5, align 8 ; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0 @@ -66,17 +66,17 @@ ; CHECK-NEXT: %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0 ; CHECK-NEXT: %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64* ; CHECK-NEXT: store i64 %6, i64* %10, align 8 -; CHECK-NEXT: %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* 
@.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0) -; CHECK-NEXT: %12 = icmp eq i32 %11, 0 -; CHECK-NEXT: br i1 %12, label %omp_offload.cont, label %omp_offload.failed +; CHECK-NEXT: %11 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1_region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i32 0, i32 0) +; CHECK-NEXT: %.not = icmp eq i32 %11, 0 +; CHECK-NEXT: br i1 %.not, label %omp_offload.cont, label %omp_offload.failed ; CHECK: omp_offload.failed: ; CHECK-NEXT: call void @heavyComputation1FallBack(i64 %6) ; CHECK-NEXT: br label %omp_offload.cont ; CHECK: omp_offload.cont: ; CHECK-NEXT: %conv2 = sitofp i32 %call1 to double -; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0)) -; CHECK-NEXT: %13 = load double, double* %a, align 8 -; CHECK-NEXT: %add = fadd double %13, %conv2 +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null) +; CHECK-NEXT: %12 = load double, double* %a, align 8 +; CHECK-NEXT: %add = fadd double %12, %conv2 ; CHECK-NEXT: ret double %add ; entry: @@ -86,39 +86,22 @@ %.offload_baseptrs4 = alloca [1 x i8*], align 8 %.offload_ptrs5 = alloca [1 x i8*], align 8 - ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin. 
- ; %device_id1 = alloca i64, align 8 - ; %async_info1 = alloca %struct.__tgt_async_info, align 8 - %0 = bitcast double* %a to i8* - %call = call i32 @rand() + %call = tail call i32 (...) @rand() %rem = srem i32 %call, 777 %conv = sitofp i32 %rem to double store double %conv, double* %a, align 8 - ; FIXME: The "isue" should be moved here. - %call1 = call i32 @rand() - - ; FIXME: This setup for the runtime call __tgt_target_data_begin should be - ; split into its "issue" and "wait" counterpars and moved upwards - ; and downwards, respectively. The call should be replaced to something - ; like ... - ; Issue - this is moved upwards. - ; ... setup code ... - ; store i64 -1, i64* %device_id1, align 8 - ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0)) - ; Wait - this is moved downwards. - ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id - ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0 - ; store i8* %handle1, i8** %queue1, align 8 - ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1) + ; FIXME: call i8* @__tgt_target_data_begin_mapper_issue(...) should be moved here. + %call1 = tail call i32 (...) 
@rand() + %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 %2 = bitcast [1 x i8*]* %.offload_baseptrs to double** store double* %a, double** %2, align 8 %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 %4 = bitcast [1 x i8*]* %.offload_ptrs to double** store double* %a, double** %4, align 8 - call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0)) + call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null) %5 = bitcast double* %a to i64* %6 = load i64, i64* %5, align 8 @@ -129,21 +112,20 @@ %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64* store i64 %6, i64* %10, align 8 - ; FIXME: The "wait" should be moved here. - %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0) - - %12 = icmp eq i32 %11, 0 - br i1 %12, label %omp_offload.cont, label %omp_offload.failed + ; FIXME: call i8* @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
+ %11 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1_region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i32 0, i32 0) + %.not = icmp eq i32 %11, 0 + br i1 %.not, label %omp_offload.cont, label %omp_offload.failed omp_offload.failed: ; preds = %entry call void @heavyComputation1FallBack(i64 %6) br label %omp_offload.cont -omp_offload.cont: ; preds = %entry, %omp_offload.failed +omp_offload.cont: ; preds = %omp_offload.failed, %entry %conv2 = sitofp i32 %call1 to double - call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0)) - %13 = load double, double* %a, align 8 - %add = fadd double %13, %conv2 + call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null) + %12 = load double, double* %a, align 8 + %add = fadd double %12, %conv2 ret double %add } @@ -179,7 +161,7 @@ ; CHECK-NEXT: %.offload_baseptrs2 = alloca [2 x i8*], align 8 ; CHECK-NEXT: %.offload_ptrs3 = alloca [2 x i8*], align 8 ; CHECK-NEXT: store i32 %size, i32* %size.addr, align 4 -; CHECK-NEXT: %call = call i32 @rand() +; CHECK-NEXT: %call = tail call i32 (...) 
@rand() ; CHECK-NEXT: %conv = zext i32 %size to i64 ; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3 ; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 @@ -198,7 +180,10 @@ ; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8 ; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 ; CHECK-NEXT: store i64 4, i64* %10, align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) + +; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.7, i64 0, i64 0), i8** null) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4 ; CHECK-NEXT: %size.casted = zext i32 %11 to i64 ; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 @@ -213,15 +198,15 @@ ; CHECK-NEXT: %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 ; CHECK-NEXT: %19 = bitcast i8** %18 to double** ; CHECK-NEXT: store double* %a, double** %19, align 8 -; CHECK-NEXT: %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0) -; CHECK-NEXT: %21 = icmp eq i32 %20, 0 -; CHECK-NEXT: br i1 %21, label %omp_offload.cont, label %omp_offload.failed +; CHECK-NEXT: %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2_region_id, i32 2, i8** nonnull %12, i8** 
nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i64 0, i64 0), i8** null, i32 0, i32 0) +; CHECK-NEXT: %.not = icmp eq i32 %20, 0 +; CHECK-NEXT: br i1 %.not, label %omp_offload.cont, label %omp_offload.failed ; CHECK: omp_offload.failed: ; CHECK-NEXT: call void @heavyComputation2FallBack(i64 %size.casted, double* %a) ; CHECK-NEXT: br label %omp_offload.cont ; CHECK: omp_offload.cont: ; CHECK-NEXT: %rem = srem i32 %call, 7 -; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.7, i64 0, i64 0), i8** null) ; CHECK-NEXT: ret i32 %rem ; entry: @@ -232,27 +217,9 @@ %.offload_baseptrs2 = alloca [2 x i8*], align 8 %.offload_ptrs3 = alloca [2 x i8*], align 8 - ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin. - ; %device_id1 = alloca i64, align 8 - ; %async_info1 = alloca %struct.__tgt_async_info, align 8 - store i32 %size, i32* %size.addr, align 4 - %call = call i32 @rand() - - ; FIXME: This setup for the runtime call __tgt_target_data_begin should be - ; split into its "issue" and "wait" counterpars. Here though, the "issue" - ; cannot be moved upwards because it's not guaranteed that rand() - ; won't modify *a. Nevertheless, the "wait" can be moved downwards. - ; The call should be replaced to something like ... - ; Issue - this can't be moved upwards, *a might have aliases. - ; ... setup code ... 
- ; store i64 -1, i64* %device_id1, align 8 - ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) - ; Wait - this is moved downards. - ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id - ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0 - ; store i8* %handle1, i8** %queue1, align 8 - ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1) + %call = tail call i32 (...) @rand() + %conv = zext i32 %size to i64 %0 = shl nuw nsw i64 %conv, 3 %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 @@ -271,7 +238,7 @@ store i32* %size.addr, i32** %9, align 8 %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) + call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.7, i64 0, i64 0), i8** null) %11 = load i32, i32* %size.addr, align 4 %size.casted = zext i32 %11 to i64 @@ -288,19 +255,18 @@ %19 = bitcast i8** %18 to double** store double* %a, double** %19, align 8 - ; FIXME: The "wait" should be moved here. 
- %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0) - - %21 = icmp eq i32 %20, 0 - br i1 %21, label %omp_offload.cont, label %omp_offload.failed + ; FIXME: call i8* @__tgt_target_data_begin_mapper_wait(...) should be moved here. + %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2_region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i64 0, i64 0), i8** null, i32 0, i32 0) + %.not = icmp eq i32 %20, 0 + br i1 %.not, label %omp_offload.cont, label %omp_offload.failed omp_offload.failed: ; preds = %entry call void @heavyComputation2FallBack(i64 %size.casted, double* %a) br label %omp_offload.cont -omp_offload.cont: ; preds = %entry, %omp_offload.failed +omp_offload.cont: ; preds = %omp_offload.failed, %entry %rem = srem i32 %call, 7 - call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) + call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.7, i64 0, i64 0), i8** null) ret i32 %rem } @@ -336,7 +302,7 @@ ; CHECK-NEXT: %.offload_baseptrs2 = alloca [2 x i8*], align 8 ; CHECK-NEXT: %.offload_ptrs3 = alloca [2 x i8*], align 8 ; CHECK-NEXT: store i32 %size, i32* %size.addr, align 4 -; CHECK-NEXT: %call = call i32 @rand() +; CHECK-NEXT: %call = tail call i32 (...) 
@rand() ; CHECK-NEXT: %conv = zext i32 %size to i64 ; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3 ; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 @@ -355,7 +321,10 @@ ; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8 ; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 ; CHECK-NEXT: store i64 4, i64* %10, align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) + +; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.7, i64 0, i64 0), i8** null) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4 ; CHECK-NEXT: %size.casted = zext i32 %11 to i64 ; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 @@ -370,15 +339,15 @@ ; CHECK-NEXT: %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 ; CHECK-NEXT: %19 = bitcast i8** %18 to double** ; CHECK-NEXT: store double* %a, double** %19, align 8 -; CHECK-NEXT: %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0) -; CHECK-NEXT: %21 = icmp eq i32 %20, 0 -; CHECK-NEXT: br i1 %21, label %omp_offload.cont, label %omp_offload.failed +; CHECK-NEXT: %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3_region_id, i32 2, i8** nonnull %12, i8** 
nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i64 0, i64 0), i8** null, i32 0, i32 0) +; CHECK-NEXT: %.not = icmp eq i32 %20, 0 +; CHECK-NEXT: br i1 %.not, label %omp_offload.cont, label %omp_offload.failed ; CHECK: omp_offload.failed: ; CHECK-NEXT: call void @heavyComputation3FallBack(i64 %size.casted, double* %a) ; CHECK-NEXT: br label %omp_offload.cont ; CHECK: omp_offload.cont: ; CHECK-NEXT: %rem = srem i32 %call, 7 -; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.7, i64 0, i64 0), i8** null) ; CHECK-NEXT: ret i32 %rem ; entry: @@ -388,29 +357,11 @@ %.offload_sizes = alloca [2 x i64], align 8 %.offload_baseptrs2 = alloca [2 x i8*], align 8 %.offload_ptrs3 = alloca [2 x i8*], align 8 - - ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin. - ; %device_id1 = alloca i64, align 8 - ; %async_info1 = alloca %struct.__tgt_async_info, align 8 - store i32 %size, i32* %size.addr, align 4 - ; FIXME: The "issue" should be moved here. - %call = call i32 @rand() - - ; FIXME: This setup for the runtime call __tgt_target_data_begin should be - ; split into its "issue" and "wait" counterpars and moved upwards - ; and downwards, respectively. The call should be replaced to something - ; like ... - ; Issue - this is moved upwards. - ; ... setup code ... 
- ; store i64 -1, i64* %device_id1, align 8 - ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) - ; Wait - this is moved downards. - ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id - ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0 - ; store i8* %handle1, i8** %queue1, align 8 - ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1) + ; FIXME: call i8* @__tgt_target_data_begin_mapper_issue(...) should be moved here. + %call = tail call i32 (...) @rand() + %conv = zext i32 %size to i64 %0 = shl nuw nsw i64 %conv, 3 %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 @@ -429,7 +380,7 @@ store i32* %size.addr, i32** %9, align 8 %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) + call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.7, i64 0, i64 0), i8** null) %11 = load i32, i32* %size.addr, align 4 %size.casted = zext i32 %11 to i64 @@ -446,19 +397,18 @@ %19 = bitcast i8** %18 to double** store double* %a, double** %19, align 8 - ; FIXME: The "wait" should be moved here. 
- %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0) - - %21 = icmp eq i32 %20, 0 - br i1 %21, label %omp_offload.cont, label %omp_offload.failed + ; FIXME: call i8* @__tgt_target_data_begin_mapper_wait(...) should be moved here. + %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3_region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i64 0, i64 0), i8** null, i32 0, i32 0) + %.not = icmp eq i32 %20, 0 + br i1 %.not, label %omp_offload.cont, label %omp_offload.failed omp_offload.failed: ; preds = %entry call void @heavyComputation3FallBack(i64 %size.casted, double* %a) br label %omp_offload.cont -omp_offload.cont: ; preds = %entry, %omp_offload.failed +omp_offload.cont: ; preds = %omp_offload.failed, %entry %rem = srem i32 %call, 7 - call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0)) + call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.7, i64 0, i64 0), i8** null) ret i32 %rem } @@ -487,7 +437,7 @@ ; CHECK-NEXT: %.offload_baseptrs = alloca [1 x i8*], align 8 ; CHECK-NEXT: %.offload_ptrs = alloca [1 x i8*], align 8 ; CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8 -; CHECK-NEXT: %call = call i32 @rand() +; CHECK-NEXT: %call = tail call i32 (...) 
@rand() ; CHECK-NEXT: %conv = zext i32 %size to i64 ; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3 ; CHECK-NEXT: %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 @@ -498,9 +448,12 @@ ; CHECK-NEXT: store double* %a, double** %4, align 8 ; CHECK-NEXT: %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0 ; CHECK-NEXT: store i64 %0, i64* %5, align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0)) + +; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.11, i64 0, i64 0), i8** null) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %rem = urem i32 %call, %size -; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0)) +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.11, i64 0, i64 0), i8** null) ; CHECK-NEXT: ret i32 %rem ; entry: @@ -508,26 +461,9 @@ %.offload_ptrs = alloca [1 x i8*], align 8 %.offload_sizes = alloca [1 x i64], align 8 - ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin. - ; %device_id1 = alloca i64, align 8 - ; %async_info1 = alloca %struct.__tgt_async_info, align 8 - - ; FIXME: The "issue" should be moved here. - %call = call i32 @rand() - - ; FIXME: This setup for the runtime call __tgt_target_data_begin should be - ; split into its "issue" and "wait" counterpars and moved upwards - ; and downwards, respectively. 
The call should be replaced to something - ; like ... - ; Issue - this is moved upwards. - ; ... setup code ... - ; store i64 -1, i64* %device_id1, align 8 - ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0)) - ; Wait - this is moved downards. - ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id - ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0 - ; store i8* %handle1, i8** %queue1, align 8 - ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1) + ; FIXME: call i8* @__tgt_target_data_begin_mapper_issue(...) should be moved here. + %call = tail call i32 (...) @rand() + %conv = zext i32 %size to i64 %0 = shl nuw nsw i64 %conv, 3 %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 @@ -538,24 +474,19 @@ store double* %a, double** %4, align 8 %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0 store i64 %0, i64* %5, align 8 - call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0)) + call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.11, i64 0, i64 0), i8** null) %rem = urem i32 %call, %size - ; FIXME: The "wait" should be moved here. 
- call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0)) + call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.11, i64 0, i64 0), i8** null) ret i32 %rem } -declare dso_local void @__tgt_target_data_begin(i64, i32, i8**, i8**, i64*, i64*) - -declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) - -declare dso_local void @__tgt_target_data_end(i64, i32, i8**, i8**, i64*, i64*) +; CHECK: declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**) +; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info) -declare dso_local i32 @rand() +declare void @__tgt_target_data_begin_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**) +declare i32 @__tgt_target_teams_mapper(i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i32, i32) +declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**) -; FIXME: These two function declarations must be generated after splitting the runtime function -; __tgt_target_data_begin. -; declare dso_local i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8), i32, i8**, i8**, i64*, i64*) -; declare dso_local void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info* dereferenceable(8)) +declare dso_local i32 @rand(...)