Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -286,6 +286,7 @@
   OMP_STRUCT_TYPE(VarName, "struct." #Name, __VA_ARGS__)
 
 __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr)
+__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, Int8Ptr)
 
 #undef __OMP_STRUCT_TYPE
 #undef OMP_STRUCT_TYPE
@@ -570,6 +571,9 @@
           VoidPtrPtr, Int64Ptr, Int64Ptr)
 __OMP_RTL(__tgt_target_data_begin_nowait, false, Void, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr)
+__OMP_RTL(__tgt_target_data_begin_issue, false, AsyncInfo, Int64, Int32, VoidPtrPtr,
+          VoidPtrPtr, Int64Ptr, Int64Ptr)
+__OMP_RTL(__tgt_target_data_begin_wait, false, Void, Int64, AsyncInfo)
 __OMP_RTL(__tgt_target_data_end, false, Void, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr)
 __OMP_RTL(__tgt_target_data_end_nowait, false, Void, Int64, Int32, VoidPtrPtr,
Index: llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
===================================================================
--- llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
+++ llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
@@ -175,20 +175,23 @@
     bool isFilled();
   };
 
-  CallBase *RuntimeCall; /// Call that involves a memotry transfer.
-  InformationCache &InfoCache;
+  CallInst *RuntimeCall; /// Call that involves a memory transfer.
+  OMPInformationCache &InfoCache;
 
   /// These help mapping the values in offload_baseptrs, offload_ptrs, and
   /// offload_sizes, respectively.
+  const unsigned BasePtrsArgNum = 2;
   std::unique_ptr<OffloadArray> BasePtrs = nullptr;
+  const unsigned PtrsArgNum = 3;
   std::unique_ptr<OffloadArray> Ptrs = nullptr;
+  const unsigned SizesArgNum = 4;
   std::unique_ptr<OffloadArray> Sizes = nullptr;
 
   /// Set of instructions that compose the argument setup for the call
   /// RuntimeCall.
   SetVector<Instruction *> Issue;
 
-  MemoryTransfer(CallBase *RuntimeCall, InformationCache &InfoCache) :
+  MemoryTransfer(CallInst *RuntimeCall, OMPInformationCache &InfoCache) :
       RuntimeCall{RuntimeCall}, InfoCache{InfoCache} {}
 
@@ -207,6 +210,11 @@
   ///   offload arrays.
   bool mayBeModifiedBy(Instruction *I);
 
+  /// Splits this memory transfer into its corresponding "issue" and "wait"
+  /// runtime calls. The "issue" is moved after \p After and the "wait" is
+  /// moved before \p Before.
+  bool split(Instruction *After, Instruction *Before);
+
 private:
   /// Gets the setup instructions for each of the values in \p OA. These
   /// instructions are stored into Issue.
@@ -218,6 +226,10 @@
 
   /// Returns true if \p I may modify one of the values in \p Values.
   bool mayModify(Instruction *I, SmallVectorImpl<Value *> &Values);
+
+  /// Removes all the instructions in Issue from their function and inserts
+  /// them after \p After.
+  void moveIssue(Instruction *After);
 };
 
 /// The slice of the module we are allowed to look at.
@@ -301,6 +313,10 @@
   /// moved. Returns nullptr if the movement is not possible, or not worth it.
   Instruction *canBeMovedUpwards(MemoryTransfer &MT);
 
+  /// Returns a pointer to the instruction where the "wait" of \p MT can be
+  /// moved. Returns nullptr if the movement is not possible, or not worth it.
+  Instruction *canBeMovedDownwards(MemoryTransfer &MT);
+
   static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                     bool GlobalOnly, bool &SingleChoice);
Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -44,6 +44,11 @@
 static cl::opt<bool>
     PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden);
 
+static cl::opt<bool> SplitMemoryTransfers(
+    "openmp-split-memtransfers",
+    cl::desc("Tries to hide the latency of host to device memory transfers"),
+    cl::Hidden, cl::init(false));
+
 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
           "Number of OpenMP runtime calls deduplicated");
 STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -253,13 +258,10 @@
   //   arrays, offload_baseptrs, offload_ptrs, offload_sizes.
   // Therefore:
   //   i8** %offload_baseptrs.
-  const unsigned BasePtrsArgNum = 2;
   Use *BasePtrsArg = RuntimeCall->arg_begin() + BasePtrsArgNum;
   //   i8** %offload_ptrs.
-  const unsigned PtrsArgNum = 3;
   Use *PtrsArg = RuntimeCall->arg_begin() + PtrsArgNum;
   //   i8** %offload_sizes.
-  const unsigned SizesArgNum = 4;
   Use *SizesArg = RuntimeCall->arg_begin() + SizesArgNum;
 
   const DataLayout &DL = InfoCache.getDL();
@@ -337,6 +339,9 @@
                << RuntimeCall->getCaller()->getName() << "\n");
     return false;
   }
+  auto *BasePtrsGEP =
+      cast<GetElementPtrInst>(RuntimeCall->getArgOperand(BasePtrsArgNum));
+  Issue.insert(BasePtrsGEP);
 
   Success = getSetupInstructions(Ptrs);
   if (!Success) {
@@ -346,6 +351,9 @@
                << RuntimeCall->getCaller()->getName() << "\n");
     return false;
   }
+  auto *PtrsGEP =
+      cast<GetElementPtrInst>(RuntimeCall->getArgOperand(PtrsArgNum));
+  Issue.insert(PtrsGEP);
 
   if (Sizes) {
     Success = getSetupInstructions(Sizes);
@@ -356,6 +364,9 @@
                  << RuntimeCall->getCaller()->getName() << "\n");
       return false;
     }
+    auto *SizesGEP =
+        cast<GetElementPtrInst>(RuntimeCall->getArgOperand(SizesArgNum));
+    Issue.insert(SizesGEP);
   }
 
   return true;
@@ -495,6 +506,65 @@
   return true;
 }
 
+bool MemoryTransfer::split(Instruction *After, Instruction *Before) {
+  assert((After || Before) &&
+         "Must have a place to move the split runtime call");
+
+  auto *M = RuntimeCall->getModule();
+  auto &IRBuilder = InfoCache.OMPBuilder;
+  // Add the "issue" runtime call declaration:
+  //   declare %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64,
+  //       i32, i8**, i8**, i64*, i64*)
+  FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+      *M, OMPRTL___tgt_target_data_begin_issue);
+
+  // Replace the RuntimeCall call site with its asynchronous version.
+  std::vector<Value *> Args;
+  Args.reserve(RuntimeCall->getNumArgOperands());
+  for (auto &Arg : RuntimeCall->args())
+    Args.push_back(Arg.get());
+
+  CallInst *IssueCallsite = CallInst::Create(
+      IssueDecl, ArrayRef<Value *>(Args), "handle", RuntimeCall);
+  RuntimeCall->removeFromParent();
+  RuntimeCall->deleteValue();
+  Issue.insert(IssueCallsite);
+
+  // Add the "wait" runtime call declaration:
+  //   declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
+  FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+      *M, OMPRTL___tgt_target_data_begin_wait);
+
+  // Add the "wait" call site.
+  const unsigned WaitNumParams = 2;
+  Value *WaitParams[] = {
+      IssueCallsite->getArgOperand(0), // device_id.
+      IssueCallsite                    // Returned handle.
+  };
+  CallInst *WaitCallsite = CallInst::Create(
+      WaitDecl, ArrayRef<Value *>(WaitParams, WaitNumParams), /*NameStr=*/"",
+      /*InsertBefore=*/(Instruction *)nullptr);
+
+  // Move the "wait".
+  if (!Before)
+    WaitCallsite->insertAfter(IssueCallsite);
+  else
+    WaitCallsite->insertBefore(Before);
+
+  if (After)
+    moveIssue(After);
+
+  return true;
+}
+
+void MemoryTransfer::moveIssue(Instruction *After) {
+  // Insert each instruction right after the previous one, preserving the
+  // original relative order of the setup instructions.
+  for (auto *I : Issue) {
+    I->removeFromParent();
+    I->insertAfter(After);
+    After = I;
+  }
+}
+
 std::unique_ptr<OffloadArray> OffloadArray::initialize(
     AllocaInst &Array, Instruction &Before, InformationCache &InfoCache) {
   if (!Array.getAllocatedType()->isArrayTy()) {
@@ -802,7 +872,8 @@
   Changed |= runAttributor();
   Changed |= deduplicateRuntimeCalls();
   Changed |= deleteParallelRegions();
-  Changed |= hideMemTransfersLatency();
+  if (SplitMemoryTransfers)
+    Changed |= hideMemTransfersLatency();
 
   return Changed;
 }
@@ -945,10 +1016,9 @@
       return false;
     }
 
-    if (auto *I = canBeMovedUpwards(MT)) {
-      // TODO: Split call and move "issue" below I.
-    }
-    return false;
+    auto *After = canBeMovedUpwards(MT);
+    auto *Before = canBeMovedDownwards(MT);
+    return (After || Before) && MT.split(After, Before);
   };
 
   RFI.foreachUse(SplitDataTransfer);
@@ -958,7 +1028,7 @@
 Instruction *OpenMPOpt::canBeMovedUpwards(MemoryTransfer &MT) {
   assert(MT.Issue.size() > 0 &&
          "There's not set of instructions to be moved!");
-  CallBase *RC = MT.RuntimeCall;
+  CallInst *RC = MT.RuntimeCall;
 
   auto *MSSAResult = OMPInfoCache.getAnalysisResultForFunction<MemorySSAAnalysis>(
       *RC->getCaller());
@@ -978,8 +1048,13 @@
       continue;
 
     auto *MemInst = (cast<MemoryUseOrDef>(MemAccess))->getMemoryInst();
-    if (MT.mayBeModifiedBy(MemInst))
-      return MemInst;
+    if (MT.mayBeModifiedBy(MemInst)) {
+      // Moving is only profitable if MemInst is not already the instruction
+      // immediately before the "issue".
+      if (!MT.Issue.count(MemInst->getNextNode()))
+        return MemInst;
+
+      return nullptr;
+    }
 
     MemAccess = MSSAWalker->getClobberingMemoryAccess(MemAccess);
   }
@@ -987,6 +1062,34 @@
 
   return nullptr;
 }
 
+Instruction *OpenMPOpt::canBeMovedDownwards(MemoryTransfer &MT) {
+  assert(MT.Issue.size() > 0 &&
+         "There's no set of instructions to be moved!");
+
+  // FIXME: This traverses only the BasicBlock where MT is. Make it traverse
+  //        the CFG.
+  GlobalValue *TgtTargetDecl = M.getNamedValue("__tgt_target");
+  GlobalValue *TgtTargetTeamsDecl = M.getNamedValue("__tgt_target_teams");
+  GlobalValue *TgtTargetDataEndDecl = M.getNamedValue("__tgt_target_data_end");
+  CallInst *RC = MT.RuntimeCall;
+  auto *I = RC->getNextNode();
+  while (I) {
+    if (auto *C = dyn_cast<CallBase>(I)) {
+      auto *Callee = C->getCalledFunction();
+      if (Callee == TgtTargetDecl)
+        return I;
+      if (Callee == TgtTargetTeamsDecl)
+        return I;
+      if (Callee == TgtTargetDataEndDecl)
+        return I;
+    }
+
+    I = I->getNextNode();
+  }
+
+  // Reached the end of the block: place the "wait" right before the terminator.
+  return RC->getParent()->getTerminator();
+}
+
 Value *OpenMPOpt::combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                       bool GlobalOnly, bool &SingleChoice) {
   if (CurrentIdent == NextIdent)
Index: llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
===================================================================
--- llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
+++ llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
@@ -1,9 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
-; RUN: opt -S -passes=openmpopt < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-split-memtransfers < %s | FileCheck %s
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
-; FIXME: This struct should be generated after splitting at least one of the runtime calls.
-; %struct.__tgt_async_info = type { i8* }
+; CHECK: %struct.__tgt_async_info = type { i8* }
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
 %struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
@@ -50,14 +49,18 @@
 ; CHECK-NEXT:    %rem = srem i32 %call, 777
 ; CHECK-NEXT:    %conv = sitofp i32 %rem to double
 ; CHECK-NEXT:    store double %conv, double* %a, align 8
+
 ; CHECK-NEXT:    %call1 = call i32 @rand()
+
 ; CHECK-NEXT:    %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
 ; CHECK-NEXT:    %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
 ; CHECK-NEXT:    store double* %a, double** %2, align 8
 ; CHECK-NEXT:    %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
 ; CHECK-NEXT:    %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
 ; CHECK-NEXT:    store double* %a, double** %4, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
+
 ; CHECK-NEXT:    %5 = bitcast double* %a to i64*
 ; CHECK-NEXT:    %6 = load i64, i64* %5, align 8
 ; CHECK-NEXT:    %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
@@ -66,7 +69,10 @@
 ; CHECK-NEXT:    %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0
 ; CHECK-NEXT:    %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
 ; CHECK-NEXT:    store i64 %6, i64* %10, align 8
-; CHECK-NEXT:    %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
+
+; CHECK-NEXT:    call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+
+; CHECK-NEXT:    %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nocapture %7, i8** nocapture %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
 ; CHECK-NEXT:    %12 = icmp eq i32 %11, 0
 ; CHECK-NEXT:    br i1 %12, label %omp_offload.cont, label %omp_offload.failed
 ; CHECK:       omp_offload.failed:
@@ -86,32 +92,15 @@
   %.offload_baseptrs4 = alloca [1 x i8*], align 8
   %.offload_ptrs5 = alloca [1 x i8*], align 8
 
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   %0 = bitcast double* %a to i8*
   %call = call i32 @rand()
   %rem = srem i32 %call, 777
   %conv = sitofp i32 %rem to double
   store double %conv, double* %a, align 8
 
-  ; FIXME: The "isue" should be moved here.
+  ; FIXME: The call to @__tgt_target_data_begin_issue(...) should be moved here.
   %call1 = call i32 @rand()
 
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. The call should be replaced to something
-  ;        like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
-  ; Wait - this is moved downwards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
   %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
   store double* %a, double** %2, align 8
@@ -129,8 +118,7 @@
   %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
   store i64 %6, i64* %10, align 8
 
-  ; FIXME: The "wait" should be moved here.
-  %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
+  %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nocapture %7, i8** nocapture %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
   %12 = icmp eq i32 %11, 0
   br i1 %12, label %omp_offload.cont, label %omp_offload.failed
@@ -148,6 +136,10 @@
 }
 
 define internal void @heavyComputation1FallBack(i64 %a) {
+; CHECK-LABEL: define {{[^@]+}}@heavyComputation1FallBack(i64 %a)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ; Fallback for offloading function heavyComputation1.
   ret void
@@ -179,7 +171,9 @@
 ; CHECK-NEXT:    %.offload_baseptrs2 = alloca [2 x i8*], align 8
 ; CHECK-NEXT:    %.offload_ptrs3 = alloca [2 x i8*], align 8
 ; CHECK-NEXT:    store i32 %size, i32* %size.addr, align 4
+
 ; CHECK-NEXT:    %call = call i32 @rand()
+
 ; CHECK-NEXT:    %conv = zext i32 %size to i64
 ; CHECK-NEXT:    %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -198,7 +192,8 @@
 ; CHECK-NEXT:    store i32* %size.addr, i32** %9, align 8
 ; CHECK-NEXT:    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
 ; CHECK-NEXT:    store i64 4, i64* %10, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+
 ; CHECK-NEXT:    %11 = load i32, i32* %size.addr, align 4
 ; CHECK-NEXT:    %size.casted = zext i32 %11 to i64
 ; CHECK-NEXT:    %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -213,6 +208,9 @@
 ; CHECK-NEXT:    %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
 ; CHECK-NEXT:    %19 = bitcast i8** %18 to double**
 ; CHECK-NEXT:    store double* %a, double** %19, align 8
+
+; CHECK-NEXT:    call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
 ; CHECK-NEXT:    %21 = icmp eq i32 %20, 0
 ; CHECK-NEXT:    br i1 %21, label %omp_offload.cont, label %omp_offload.failed
@@ -232,27 +230,9 @@
   %.offload_baseptrs2 = alloca [2 x i8*], align 8
   %.offload_ptrs3 = alloca [2 x i8*], align 8
 
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   store i32 %size, i32* %size.addr, align 4
   %call = call i32 @rand()
 
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ;        split into its "issue" and "wait" counterpars. Here though, the "issue"
-  ;        cannot be moved upwards because it's not guaranteed that rand()
-  ;        won't modify *a. Nevertheless, the "wait" can be moved downwards.
-  ;        The call should be replaced to something like ...
-  ; Issue - this can't be moved upwards, *a might have aliases.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -288,7 +268,6 @@
   %19 = bitcast i8** %18 to double**
   store double* %a, double** %19, align 8
 
-  ; FIXME: The "wait" should be moved here.
   %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
   %21 = icmp eq i32 %20, 0
   br i1 %21, label %omp_offload.cont, label %omp_offload.failed
@@ -305,6 +284,10 @@
 }
 
 define internal void @heavyComputation2FallBack(i64 %size, double* %a) {
+; CHECK-LABEL: define {{[^@]+}}@heavyComputation2FallBack(i64 %size, double* %a)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ; Fallback for offloading function heavyComputation2.
   ret void
@@ -336,7 +319,9 @@
 ; CHECK-NEXT:    %.offload_baseptrs2 = alloca [2 x i8*], align 8
 ; CHECK-NEXT:    %.offload_ptrs3 = alloca [2 x i8*], align 8
 ; CHECK-NEXT:    store i32 %size, i32* %size.addr, align 4
+
 ; CHECK-NEXT:    %call = call i32 @rand()
+
 ; CHECK-NEXT:    %conv = zext i32 %size to i64
 ; CHECK-NEXT:    %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -355,7 +340,8 @@
 ; CHECK-NEXT:    store i32* %size.addr, i32** %9, align 8
 ; CHECK-NEXT:    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
 ; CHECK-NEXT:    store i64 4, i64* %10, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+
 ; CHECK-NEXT:    %11 = load i32, i32* %size.addr, align 4
 ; CHECK-NEXT:    %size.casted = zext i32 %11 to i64
 ; CHECK-NEXT:    %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -370,6 +356,9 @@
 ; CHECK-NEXT:    %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
 ; CHECK-NEXT:    %19 = bitcast i8** %18 to double**
 ; CHECK-NEXT:    store double* %a, double** %19, align 8
+
+; CHECK-NEXT:    call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
 ; CHECK-NEXT:    %21 = icmp eq i32 %20, 0
 ; CHECK-NEXT:    br i1 %21, label %omp_offload.cont, label %omp_offload.failed
@@ -389,28 +378,11 @@
   %.offload_baseptrs2 = alloca [2 x i8*], align 8
   %.offload_ptrs3 = alloca [2 x i8*], align 8
 
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   store i32 %size, i32* %size.addr, align 4
 
-  ; FIXME: The "issue" should be moved here.
+  ; FIXME: The call to @__tgt_target_data_begin_issue(...) should be moved here.
   %call = call i32 @rand()
 
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. The call should be replaced to something
-  ;        like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -446,7 +418,6 @@
   %19 = bitcast i8** %18 to double**
   store double* %a, double** %19, align 8
 
-  ; FIXME: The "wait" should be moved here.
   %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
   %21 = icmp eq i32 %20, 0
   br i1 %21, label %omp_offload.cont, label %omp_offload.failed
@@ -463,6 +434,10 @@
 }
 
 define internal void @heavyComputation3FallBack(i64 %size, double* %a) {
+; CHECK-LABEL: define {{[^@]+}}@heavyComputation3FallBack(i64 %size, double* %a)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ; Fallback for offloading function heavyComputation3.
   ret void
@@ -487,7 +462,9 @@
 ; CHECK-NEXT:    %.offload_baseptrs = alloca [1 x i8*], align 8
 ; CHECK-NEXT:    %.offload_ptrs = alloca [1 x i8*], align 8
 ; CHECK-NEXT:    %.offload_sizes = alloca [1 x i64], align 8
+
 ; CHECK-NEXT:    %call = call i32 @rand()
+
 ; CHECK-NEXT:    %conv = zext i32 %size to i64
 ; CHECK-NEXT:    %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT:    %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -498,8 +475,12 @@
 ; CHECK-NEXT:    store double* %a, double** %4, align 8
 ; CHECK-NEXT:    %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
 ; CHECK-NEXT:    store i64 %0, i64* %5, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
+
 ; CHECK-NEXT:    %rem = urem i32 %call, %size
+
+; CHECK-NEXT:    call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
 ; CHECK-NEXT:    ret i32 %rem
 ;
@@ -508,26 +489,9 @@
   %.offload_ptrs = alloca [1 x i8*], align 8
   %.offload_sizes = alloca [1 x i64], align 8
 
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
-  ; FIXME: The "issue" should be moved here.
+  ; FIXME: The call to @__tgt_target_data_begin_issue(...) should be moved here.
   %call = call i32 @rand()
 
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. The call should be replaced to something
-  ;        like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -542,7 +506,6 @@
 
   %rem = urem i32 %call, %size
 
-  ; FIXME: The "wait" should be moved here.
   call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
   ret i32 %rem
 }
@@ -555,7 +518,6 @@
 
 declare dso_local i32 @rand()
 
-; FIXME: These two function declarations must be generated after splitting the runtime function
-;        __tgt_target_data_begin.
-; declare dso_local i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8), i32, i8**, i8**, i64*, i64*)
-; declare dso_local void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info* dereferenceable(8))
+; CHECK: declare %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64, i32, i8**, i8**, i64*, i64*)
+; CHECK: declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
+
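For reference, here is an editorial sketch (not part of the patch itself) of the rewrite that the new runtime entry points enable. The call signatures match the __OMP_RTL declarations added above; the value names %baseptrs, %ptrs, %sizes, %maptypes and %unrelated are illustrative placeholders, not names produced by the pass:

  ; Before the pass: the host blocks in __tgt_target_data_begin until the
  ; copy to the device has finished, and only then runs the independent
  ; call to @rand().
  call void @__tgt_target_data_begin(i64 -1, i32 1, i8** %baseptrs, i8** %ptrs, i64* %sizes, i64* %maptypes)
  %unrelated = call i32 @rand()

  ; After the pass: the transfer is issued asynchronously and yields a
  ; handle, the independent host work overlaps with the copy, and the
  ; "wait" synchronizes only where the transferred data is first needed.
  %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_issue(i64 -1, i32 1, i8** %baseptrs, i8** %ptrs, i64* %sizes, i64* %maptypes)
  %unrelated = call i32 @rand()
  call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)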