Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -49,6 +50,7 @@
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
 
 #include <algorithm>
@@ -816,6 +818,9 @@
       // TODO: This should be folded into buildCustomStateMachine.
       Changed |= rewriteDeviceCodeStateMachine();
 
+      if (HideMemoryTransferLatency)
+        Changed |= hideMemTransfersLatency();
+
       if (remarksEnabled())
         analysisGlobalization();
 
@@ -833,8 +838,6 @@
 
       Changed |= deleteParallelRegions();
 
-      if (HideMemoryTransferLatency)
-        Changed |= hideMemTransfersLatency();
       Changed |= deduplicateRuntimeCalls();
       if (EnableParallelRegionMerging) {
         if (mergeParallelRegions()) {
@@ -1375,40 +1378,380 @@
     return Changed;
   }
 
+  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts and
+  /// returns them in \p IssueWaitPair.
+  bool
+  splitMapperToIssueAndWait(CallInst *RuntimeCall,
+                            std::pair<CallInst *, CallInst *> &IssueWaitPair) {
+    auto &IRBuilder = OMPInfoCache.OMPBuilder;
+    Function *F = RuntimeCall->getCaller();
+    const DataLayout &DL = M.getDataLayout();
+    Instruction *FirstInst = &(F->getEntryBlock().front());
+    AllocaInst *Handle = new AllocaInst(
+        IRBuilder.AsyncInfo, DL.getAllocaAddrSpace(), "handle", FirstInst);
+
+    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_issue);
+
+    // Change the RuntimeCall call site to its asynchronous version.
+    SmallVector<Value *, 16> Args;
+    for (auto &Arg : RuntimeCall->args())
+      Args.push_back(Arg.get());
+    Args.push_back(Handle);
+
+    auto IssueCall =
+        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", RuntimeCall);
+    assert(IssueCall && "Incomplete splitting");
+    IssueWaitPair.first = IssueCall;
+
+    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_wait);
+
+    Value *WaitParams[3] = {
+        RuntimeCall->getArgOperand(0),
+        RuntimeCall->getArgOperand(OffloadArray::DeviceIDArgNum), Handle};
+    auto WaitCall = CallInst::Create(WaitDecl, WaitParams, "", RuntimeCall);
+    assert(WaitCall && "Incomplete splitting");
+    IssueWaitPair.second = WaitCall;
+
+    RuntimeCall->eraseFromParent();
+
+    return true;
+  }
+
+  // Collects the transitive users of \p I (and the instructions those users
+  // depend on), cloning every node that is safe to speculatively execute, and
+  // moves the original instructions right before \p MoveI. The clones are
+  // left at the original position so the caller can redirect uses outside the
+  // moved region to them.
+  bool getUseTreeDuplicated(
+      SmallVector<std::pair<Instruction *, Instruction *>> &UseTreeDuplicated,
+      Instruction *I, Instruction *MoveI) {
+    UseTreeDuplicated.push_back({I, nullptr});
+
+    for (unsigned i = 0; i < UseTreeDuplicated.size(); ++i)
+      for (User *U : UseTreeDuplicated[i].first->users()) {
+        if (dyn_cast<CallInst>(U))
+          continue;
+
+        assert((dyn_cast(U) || dyn_cast(U) ||
+                dyn_cast(U)) &&
+               "Problem in data mapping");
+
+        auto *UInstr = cast<Instruction>(U);
+
+        if (isSafeToSpeculativelyExecute(UInstr))
+          UseTreeDuplicated.push_back({UInstr, UInstr->clone()});
+        else
+          UseTreeDuplicated.push_back({UInstr, nullptr});
+      }
+
+    SetVector<Instruction *> OperandSet;
+    for (auto Utd : UseTreeDuplicated)
+      OperandSet.insert(Utd.first);
+
+    for (unsigned i = 0; i < OperandSet.size(); ++i) {
+      auto Ops = OperandSet[i];
+      for (auto &Op : Ops->operands()) {
+        if (!dyn_cast<Instruction>(Op))
+          continue;
+        if (!dyn_cast(Op) && !dyn_cast(Op) &&
+            !dyn_cast(Op) && !dyn_cast(Op) &&
+            !dyn_cast(Op))
+          continue;
+        if (OperandSet.contains(dyn_cast<Instruction>(Op)))
+          continue;
+        OperandSet.insert(dyn_cast<Instruction>(Op));
+        if (isSafeToSpeculativelyExecute(dyn_cast<Instruction>(Op)))
+          UseTreeDuplicated.push_back(
+              {dyn_cast<Instruction>(Op), dyn_cast<Instruction>(Op)->clone()});
+        else
+          UseTreeDuplicated.push_back({dyn_cast<Instruction>(Op), nullptr});
+      }
+    }
+
+    SmallSet<unsigned, 16> IndexSet;
+    SmallSet<Instruction *, 16> InsertSet;
+    unsigned VecSize = UseTreeDuplicated.size();
+
+    while (IndexSet.size() < VecSize - 1)
+      for (unsigned i = 1; i < VecSize; ++i) {
+        if (IndexSet.contains(i))
+          continue;
+        bool InsertFlag = true;
+        for (auto &Op : UseTreeDuplicated[i].first->operands()) {
+          if (!dyn_cast<Instruction>(Op))
+            continue;
+          // If one of its operands is supposed to be inserted but is not
+          // placed yet, this instruction cannot be inserted either.
+          if ((cast<Instruction>(Op) != I) &&
+              OperandSet.contains(cast<Instruction>(Op)) &&
+              !InsertSet.contains(cast<Instruction>(Op))) {
+            InsertFlag = false;
+            break;
+          }
+        }
+        // Cannot be inserted yet.
+        if (!InsertFlag)
+          continue;
+        InsertSet.insert(UseTreeDuplicated[i].first);
+        IndexSet.insert(i);
+
+        // Insert the duplicate if it exists.
+        if (UseTreeDuplicated[i].second)
+          UseTreeDuplicated[i].second->insertBefore(UseTreeDuplicated[i].first);
+
+        // Move the instruction to the right BB.
+        UseTreeDuplicated[i].first->moveBefore(MoveI);
+      }
+
+    return true;
+  }
+
+  // Outlines the "issue" runtime call, together with the code that fills the
+  // offload arrays it consumes, into a separate wrapper function.
+  Function *outlineMapperIssueRT(CallInst &RuntimeCall) {
+    BasicBlock *RuntimeCallBB = RuntimeCall.getParent();
+    Value *BasePtrsArg =
+        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
+    // %offload_ptrs.
+    Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
+    // %offload_sizes.
+    Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
+    auto *BasePtrsArray =
+        dyn_cast<AllocaInst>(getUnderlyingObject(BasePtrsArg));
+    auto *PtrsArray = dyn_cast<AllocaInst>(getUnderlyingObject(PtrsArg));
+    auto *SizesArray = dyn_cast<AllocaInst>(getUnderlyingObject(SizesArg));
+    if (!BasePtrsArray || !PtrsArray)
+      return nullptr;
+
+    // Split off the basic block that will be outlined.
+    BasicBlock *MapperBB =
+        RuntimeCallBB->splitBasicBlock(&RuntimeCall, "mapper.bb");
+    BasicBlock *NextBB =
+        MapperBB->splitBasicBlock(RuntimeCall.getNextNode(), "next.bb");
+    SmallVector<std::pair<Instruction *, Instruction *>> BasePtrsUses;
+    SmallVector<std::pair<Instruction *, Instruction *>> PtrsUses;
+
+    getUseTreeDuplicated(BasePtrsUses, BasePtrsArray, (&RuntimeCall));
+    getUseTreeDuplicated(PtrsUses, PtrsArray, (&RuntimeCall));
+
+    for (auto Bpu : BasePtrsUses)
+      if (Bpu.second)
+        Bpu.first->replaceUsesOutsideBlock(Bpu.second, MapperBB);
+    for (auto Pu : PtrsUses)
+      if (Pu.second)
+        Pu.first->replaceUsesOutsideBlock(Pu.second, MapperBB);
+
+    // If the sizes array is also set up through an alloca, handle its use
+    // tree as well.
+    if (SizesArray) {
+      SmallVector<std::pair<Instruction *, Instruction *>> SizesArrayUses;
+      getUseTreeDuplicated(SizesArrayUses, SizesArray, (&RuntimeCall));
+      for (auto Sau : SizesArrayUses)
+        if (Sau.second)
+          Sau.first->replaceUsesOutsideBlock(Sau.second, MapperBB);
+    }
+    SmallVector<BasicBlock *> ExtractBB{MapperBB};
+    CodeExtractor CE(ExtractBB);
+    CodeExtractorAnalysisCache CEAC(*(MapperBB->getParent()));
+    Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
+    if (!OutlinedFunc)
+      return nullptr;
+
+    OutlinedFunc->setName("__openmp_mapper_issue_wrapper_" +
+                          MapperBB->getParent()->getName());
+
+    MergeBlockIntoPredecessor(NextBB);
+    // Annotate the outlined function.
+    OutlinedFunc->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
+    unsigned i = 0;
+    for (auto &A : OutlinedFunc->args()) {
+      if (A.getType()->isPointerTy())
+        OutlinedFunc->addParamAttr(i, Attribute::NoCapture);
+      i++;
+    }
+    return OutlinedFunc;
+  }
+
+  // Returns true if \p CI can be moved before or after \p I.
+  bool canMoveThrough(CallInst *CI, Instruction *I, AliasAnalysis &AA,
+                      SmallSet<Value *, 16> &CIArgs) {
+    if (CIArgs.contains(I))
+      return false;
+    if (!(I->mayHaveSideEffects()))
+      return true;
+    auto MR = AA.getModRefInfo(I, CI);
+    return isNoModRef(MR);
+  }
+
+  // Returns true if \p CI can be moved across every instruction in \p B.
+  bool canMoveThroughBlock(CallInst *CI, BasicBlock *B, AliasAnalysis &AA,
+                           SmallSet<Value *, 16> &CIArgs) {
+    for (auto &I : *B)
+      if (!canMoveThrough(CI, &I, AA, CIArgs))
+        return false;
+    return true;
+  }
+
+  // This function gets the current BB of the 'issue' call and returns the
+  // next BB the 'issue' call can safely move to.
+  BasicBlock *findNextBBToCheckForMoving(CallInst *CI, BasicBlock *B,
+                                         AliasAnalysis &AA, DominatorTree &DT,
+                                         PostDominatorTree &PDT, LoopInfo &LI,
+                                         SmallSet<Value *, 16> &CIArgs) {
+    Loop *BLoop = LI.getLoopFor(B);
+    BasicBlock *DestBlock = B;
+    BasicBlock *DomBlock;
+
+    // Walk up the dominator tree until we find an immediate dominator that is
+    // in the same loop as B.
+    while (true) {
+      auto Dom = DT.getNode(DestBlock)->getIDom();
+      if (!Dom)
+        return B;
+      DomBlock = Dom->getBlock();
+      Loop *DomLoop = LI.getLoopFor(DomBlock);
+      if (BLoop == DomLoop)
+        break;
+      DestBlock = DomBlock;
+    }
+    // Every block on a path from the candidate dominator to B must be safe to
+    // move the call across.
+    SetVector<BasicBlock *> PredecessorBB;
+    PredecessorBB.insert(B);
+
+    for (unsigned i = 0; i < PredecessorBB.size(); ++i)
+      for (BasicBlock *S : predecessors(PredecessorBB[i])) {
+        if (S == DomBlock)
+          continue;
+        if (!canMoveThroughBlock(CI, S, AA, CIArgs))
+          return B;
+        PredecessorBB.insert(S);
+      }
+
+    bool PostDomination = PDT.dominates(B, DomBlock);
+
+    if (PostDomination)
+      return DomBlock;
+    return B;
+  }
+
+  // Moves the "wait" runtime call downwards within its original BB, as far as
+  // it is safe to do so.
+  bool moveWaitRTCInOrigBB(CallInst *IssueWrapperCall, CallInst *RTCallWait,
+                           AliasAnalysis &AA, SmallSet<Value *, 16> &CIArgs) {
+    Instruction *I = RTCallWait;
+    Instruction *WaitMovePoint = nullptr;
+
+    while ((I = I->getNextNonDebugInstruction()))
+      if (!canMoveThrough(IssueWrapperCall, I, AA, CIArgs)) {
+        WaitMovePoint = I;
+        break;
+      }
+    if (!I)
+      WaitMovePoint = RTCallWait->getParent()->getTerminator();
+    RTCallWait->moveBefore(WaitMovePoint);
+    return true;
+  }
+
+  // Moves the "issue" wrapper call upwards within \p CurrentBB. Returns false
+  // if the call could be moved beyond the beginning of its original BB.
+  bool moveIssueRTCInBB(CallInst *IssueWrapperCall, BasicBlock *CurrentBB,
+                        AliasAnalysis &AA, SmallSet<Value *, 16> &IssueCallArgs,
+                        bool IsOrigBB = false) {
+    Instruction *IssueMovePoint = nullptr;
+    Instruction *I;
+    if (CurrentBB == IssueWrapperCall->getParent())
+      I = IssueWrapperCall;
+    else
+      I = CurrentBB->getTerminator();
+
+    while ((I = I->getPrevNonDebugInstruction()))
+      if (!canMoveThrough(IssueWrapperCall, I, AA, IssueCallArgs)) {
+        IssueMovePoint = I;
+        break;
+      }
+
+    // The call can be moved beyond its original BB. For the time being, move
+    // it to the beginning of the BB and let the caller keep looking.
+    if (!I && IsOrigBB) {
+      IssueWrapperCall->moveBefore(CurrentBB->getFirstNonPHI());
+      return false;
+    }
+    // Insert the issue call at the very beginning of the BB.
+    if (!I) {
+      IssueWrapperCall->moveBefore(CurrentBB->getFirstNonPHI());
+      return true;
+    }
+    IssueWrapperCall->moveAfter(IssueMovePoint);
+    return true;
+  }
+
+  // Moves the "issue" wrapper call upwards and the "wait" call downwards as
+  // far as it is safe to do so.
+  bool moveIssueAndWaitRTC(CallInst *IssueWrapperCall, CallInst *RTCallWait,
+                           AliasAnalysis &AA, LoopInfo &LI, DominatorTree &DT,
+                           PostDominatorTree &PDT, BasicBlock *IssueBB) {
+
+    SmallSet<Value *, 16> IssueCallArgs;
+    for (auto &A : IssueWrapperCall->args())
+      IssueCallArgs.insert(A);
+
+    moveWaitRTCInOrigBB(IssueWrapperCall, RTCallWait, AA, IssueCallArgs);
+    bool IssueMovedInOrigBB =
+        moveIssueRTCInBB(IssueWrapperCall, IssueBB, AA, IssueCallArgs, true);
+    if (IssueMovedInOrigBB)
+      return true;
+
+    BasicBlock *CurrentBB = IssueBB;
+    BasicBlock *NextBB;
+    while ((NextBB = findNextBBToCheckForMoving(IssueWrapperCall, CurrentBB, AA,
+                                                DT, PDT, LI, IssueCallArgs))) {
+      // It cannot move any further.
+      if (NextBB == CurrentBB)
+        break;
+      CurrentBB = NextBB;
+    }
+    if (CurrentBB == IssueBB)
+      moveIssueRTCInBB(IssueWrapperCall, CurrentBB, AA, IssueCallArgs, true);
+    else
+      moveIssueRTCInBB(IssueWrapperCall, CurrentBB, AA, IssueCallArgs);
+    return true;
+  }
+
   /// Tries to hide the latency of runtime calls that involve host to
   /// device memory transfers by splitting them into their "issue" and "wait"
   /// versions. The "issue" is moved upwards as much as possible. The "wait" is
-  /// moved downards as much as possible. The "issue" issues the memory transfer
-  /// asynchronously, returning a handle. The "wait" waits in the returned
-  /// handle for the memory transfer to finish.
-  bool hideMemTransfersLatency() {
-    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
+  /// moved downwards as much as possible in its original BB. The "issue"
+  /// issues the memory transfer asynchronously, returning a handle. The
+  /// "wait" waits in the returned handle for the memory transfer to finish.
+  bool hideMemTransfersLatencyHelper(Function &F) {
+
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
     bool Changed = false;
-    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
+    auto AsyncMemTransfers = [&](Use &U, Function &Decl) {
       auto *RTCall = getCallIfRegularCall(U, &RFI);
       if (!RTCall)
         return false;
+      Changed = true;
 
-      OffloadArray OffloadArrays[3];
-      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
-        return false;
-
-      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
-
-      // TODO: Check if can be moved upwards.
-      bool WasSplit = false;
-      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
-      if (WaitMovementPoint)
-        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
+      std::pair<CallInst *, CallInst *> IssueWaitPair;
+      splitMapperToIssueAndWait(RTCall, IssueWaitPair);
+      CallInst *RTCallIssue = IssueWaitPair.first;
+      CallInst *RTCallWait = IssueWaitPair.second;
+      BasicBlock *IssueBB = RTCallIssue->getParent();
+      Function *IssueWrapper = outlineMapperIssueRT(*RTCallIssue);
+      assert(IssueWrapper && "Incomplete outlining in latency hiding");
+      // There is a one-to-one relation between the runtime call and
+      // IssueWrapper.
+      CallInst *IssueWrapperCall =
+          cast<CallInst>(*(IssueWrapper->users().begin()));
+      assert(IssueWrapperCall && "Incomplete outlining in latency hiding");
+
+      AliasAnalysis *AA =
+          OMPInfoCache.getAnalysisResultForFunction<AAManager>(F);
+      LoopInfo *LI = OMPInfoCache.getAnalysisResultForFunction<LoopAnalysis>(F);
+      DominatorTree *DT =
+          OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
+      PostDominatorTree *PDT =
+          OMPInfoCache.getAnalysisResultForFunction<PostDominatorTreeAnalysis>(
+              F);
+
+      moveIssueAndWaitRTC(IssueWrapperCall, RTCallWait, *AA, *LI, *DT, *PDT,
+                          IssueBB);
 
-      Changed |= WasSplit;
-      return WasSplit;
+      return true;
     };
 
-    RFI.foreachUse(SCC, SplitMemTransfers);
+    RFI.foreachUse(AsyncMemTransfers, &F);
 
     return Changed;
   }
 
+  bool hideMemTransfersLatency() {
+    bool Changed = false;
+    for (auto &F : M)
+      Changed |= hideMemTransfersLatencyHelper(F);
+    return Changed;
+  }
+
   /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels.
   /// TODO: Make this an AA and expand it to work across blocks and functions.
   bool eliminateBarriers() {
@@ -1719,81 +2062,6 @@
     LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
   }
 
-  /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
-  /// moved. Returns nullptr if the movement is not possible, or not worth it.
-  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
-    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
-    //        Make it traverse the CFG.
-
-    Instruction *CurrentI = &RuntimeCall;
-    bool IsWorthIt = false;
-    while ((CurrentI = CurrentI->getNextNode())) {
-
-      // TODO: Once we detect the regions to be offloaded we should use the
-      //       alias analysis manager to check if CurrentI may modify one of
-      //       the offloaded regions.
-      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
-        if (IsWorthIt)
-          return CurrentI;
-
-        return nullptr;
-      }
-
-      // FIXME: For now if we move it over anything without side effect
-      //        is worth it.
-      IsWorthIt = true;
-    }
-
-    // Return end of BasicBlock.
- return RuntimeCall.getParent()->getTerminator(); - } - - /// Splits \p RuntimeCall into its "issue" and "wait" counterparts. - bool splitTargetDataBeginRTC(CallInst &RuntimeCall, - Instruction &WaitMovementPoint) { - // Create stack allocated handle (__tgt_async_info) at the beginning of the - // function. Used for storing information of the async transfer, allowing to - // wait on it later. - auto &IRBuilder = OMPInfoCache.OMPBuilder; - auto *F = RuntimeCall.getCaller(); - Instruction *FirstInst = &(F->getEntryBlock().front()); - AllocaInst *Handle = new AllocaInst( - IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst); - - // Add "issue" runtime call declaration: - // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, - // i8**, i8**, i64*, i64*) - FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___tgt_target_data_begin_mapper_issue); - - // Change RuntimeCall call site for its asynchronous version. - SmallVector Args; - for (auto &Arg : RuntimeCall.args()) - Args.push_back(Arg.get()); - Args.push_back(Handle); - - CallInst *IssueCallsite = - CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall); - OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite); - RuntimeCall.eraseFromParent(); - - // Add "wait" runtime call declaration: - // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info) - FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___tgt_target_data_begin_mapper_wait); - - Value *WaitParams[2] = { - IssueCallsite->getArgOperand( - OffloadArray::DeviceIDArgNum), // device_id. - Handle // handle to wait on. - }; - CallInst *WaitCallsite = CallInst::Create( - WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); - OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite); - - return true; - } - static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, bool GlobalOnly, bool &SingleChoice) { if (CurrentIdent == NextIdent) Index: llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll =================================================================== --- llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll +++ llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll @@ -1,528 +1,306 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -S -passes=openmp-opt-cgscc -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - -; CHECK: %struct.__tgt_async_info = type { i8* } - -%struct.ident_t = type { i32, i32, i32, i32, i8* } -%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } - -@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] -@.__omp_offloading_heavyComputation1.region_id = weak constant i8 0 -@.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8] -@.offload_maptypes.2 = private unnamed_addr constant [1 x i64] [i64 800] - -@.__omp_offloading_heavyComputation2.region_id = weak constant i8 0 -@.offload_maptypes.3 = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=openmp-opt -openmp-hide-memory-transfer-latency < %s | FileCheck %s -@.__omp_offloading_heavyComputation3.region_id = weak constant i8 0 -@.offload_sizes.2 = private unnamed_addr constant [2 x i64] [i64 4, i64 0] -@.offload_maptypes.4 = private 
unnamed_addr constant [2 x i64] [i64 800, i64 544] - -@.offload_maptypes.5 = private unnamed_addr constant [1 x i64] [i64 33] - -@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8 -@.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 - -;double heavyComputation1() { -; double a = rand() % 777; -; double random = rand(); -; -; //#pragma omp target data map(a) -; void* args[1]; -; args[0] = &a; -; __tgt_target_data_begin(..., args, ...) -; -; #pragma omp target teams -; for (int i = 0; i < 1000; ++i) { -; a *= i*i / 2; -; } -; -; return random + a; -;} -define dso_local double @heavyComputation1() { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation1() { +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } + +@global = dso_local global i32 0, align 4 +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 3] +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 +@.str = private unnamed_addr constant [15 x i8] c"*** Result %d\0A\00", align 1 +@.str.1 = private unnamed_addr constant [7 x i8] c"Begin\0A\00", align 1 +@llvm.embedded.object = private constant [976 x i8] c"\10\FF\10\AD\01\00\00\00\D0\03\00\00\00\00\00\00 \00\00\00\00\00\00\00(\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00H\00\00\00\00\00\00\00\02\00\00\00\00\00\00\00\90\00\00\00\00\00\00\00:\03\00\00\00\00\00\00n\00\00\00\00\00\00\00u\00\00\00\00\00\00\00i\00\00\00\00\00\00\00\89\00\00\00\00\00\00\00\00arch\00triple\00nvptx64-nvidia-cuda\00sm_61\00\00; ModuleID = 'offload-openmp-nvptx64-nvidia-cuda.tmp.bc'\0Asource_filename = \22offload.c\22\0Atarget datalayout = \22e-i64:64-i128:128-v16:16-v32:32-n16:32:64\22\0Atarget triple = \22nvptx64-nvidia-cuda\22\0A\0A@__omp_rtl_debug_kind = weak_odr hidden constant i32 0\0A@__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0\0A@__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0\0A@__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0\0A\0A!llvm.module.flags = !{!0, !1, !2, !3, !4}\0A!llvm.ident = !{!5}\0A\0A!0 = !{i32 1, !\22wchar_size\22, i32 4}\0A!1 = !{i32 7, !\22openmp\22, i32 50}\0A!2 = !{i32 7, !\22openmp-device\22, i32 50}\0A!3 = !{i32 7, !\22PIC Level\22, i32 2}\0A!4 = !{i32 7, !\22frame-pointer\22, i32 2}\0A!5 = !{!\22clang version 16.0.0 (https://github.com/llvm/llvm-project.git cf1d9a1fdca258ab56f3060dfa4a303b8127350e)\22}\0A\00\00\00\00\00\00", section ".llvm.offloading", align 8, !exclude !0 +@llvm.compiler.used = appending global [1 x ptr] [ptr @llvm.embedded.object], section "llvm.metadata" + +; Function Attrs: noinline nounwind uwtable +define dso_local void @test_func(i32 noundef %N) #0 { +; CHECK-LABEL: @test_func( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 -; CHECK-NEXT: [[A:%.*]] = alloca double, align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to i8* -; 
CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 777 -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to double -; CHECK-NEXT: store double [[CONV]], double* [[A]], align 8 -; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @[[GLOB0:[0-9]+]], i64 -1, i32 1, i8** [[TMP1]], i8** [[TMP3]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to i64* -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i64 0, i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS4]] to i64* -; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS5]], i64 0, i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS5]] to i64* -; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull [[TMP7]], i8** nonnull [[TMP9]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP11]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation1FallBack(i64 [[TMP6]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[CONV2:%.*]] = sitofp i32 [[CALL1]] to double -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[A]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP12]], [[CONV2]] -; CHECK-NEXT: ret double [[ADD]] -; - - - - - - -entry: - %a = alloca double, align 8 - %.offload_baseptrs = alloca [1 x i8*], align 8 - %.offload_ptrs = alloca [1 x i8*], align 8 - %.offload_baseptrs4 = alloca [1 x i8*], align 8 - %.offload_ptrs5 = alloca [1 x i8*], align 8 - - %0 = bitcast double* %a to i8* - %call = tail call i32 (...) 
@rand() - %rem = srem i32 %call, 777 - %conv = sitofp i32 %rem to double - store double %conv, double* %a, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @0, ...) should be moved here. - %call1 = tail call i32 (...) @rand() - - %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [1 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [1 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) - - %5 = bitcast double* %a to i64* - %6 = load i64, i64* %5, align 8 - %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0 - %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64* - store i64 %6, i64* %8, align 8 - %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0 - %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64* - store i64 %6, i64* %10, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. - %11 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %11, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation1FallBack(i64 %6) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %conv2 = sitofp i32 %call1 to double - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) - %12 = load double, double* %a, align 8 - %add = fadd double %12, %conv2 - ret double %add -} - -define internal void @heavyComputation1FallBack(i64 %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation1FallBack -; CHECK-SAME: (i64 [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -entry: - ; Fallback for offloading function heavyComputation1. - ret void -} - -;int heavyComputation2(double* a, unsigned size) { -; int random = rand() % 7; -; -; //#pragma omp target data map(a[0:size], size) -; void* args[2]; -; args[0] = &a; -; args[1] = &size; -; __tgt_target_data_begin(..., args, ...) 
-; -; #pragma omp target teams -; for (int i = 0; i < size; ++i) { -; a[i] = ++a[i] * 3.141624; -; } -; -; return random; -;} -define dso_local i32 @heavyComputation2(double* %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation2 -; CHECK-SAME: (double* [[A:%.*]], i32 [[SIZE:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: store i32 [[SIZE]], i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 1 -; CHECK-NEXT: store i64 4, i64* [[TMP10]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[SIZE_CASTED:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS3]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP17]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 
0, i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull [[TMP12]], i8** nonnull [[TMP14]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation2FallBack(i64 [[SIZE_CASTED]], double* [[A]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] -; - - -entry: - %size.addr = alloca i32, align 4 - %.offload_baseptrs = alloca [2 x i8*], align 8 - %.offload_ptrs = alloca [2 x i8*], align 8 - %.offload_sizes = alloca [2 x i64], align 8 - %.offload_baseptrs2 = alloca [2 x i8*], align 8 - %.offload_ptrs3 = alloca [2 x i8*], align 8 - - store i32 %size, i32* %size.addr, align 4 - %call = tail call i32 (...) @rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [2 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 - %7 = bitcast i8** %6 to i32** - store i32* %size.addr, i32** %7, align 8 - %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 - %9 = bitcast i8** %8 to i32** - store i32* %size.addr, i32** %9, align 8 - %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 - store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - - %11 = load i32, i32* %size.addr, align 4 - %size.casted = zext i32 %11 to i64 - %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 - %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64* - store i64 %size.casted, i64* %13, align 8 - %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0 - %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64* - store i64 %size.casted, i64* %15, align 8 - %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1 - %17 = bitcast i8** %16 to double** - store double* %a, double** %17, align 8 - %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 - %19 
= bitcast i8** %18 to double** - store double* %a, double** %19, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. - %20 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %20, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation2FallBack(i64 %size.casted, double* %a) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %rem = srem i32 %call, 7 - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem -} - -define internal void @heavyComputation2FallBack(i64 %size, double* %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation2FallBack -; CHECK-SAME: (i64 [[SIZE:%.*]], double* [[A:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = mul i64 4, [[CONV]] +; CHECK-NEXT: [[CALL:%.*]] = call noalias ptr @malloc(i64 noundef [[MUL]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I_0]], 8 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I_0]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[N]] to i64 +; CHECK-NEXT: [[MUL3:%.*]] = mul i64 4, [[CONV2]] +; CHECK-NEXT: [[CALL4:%.*]] = call noalias ptr @malloc(i64 noundef [[MUL3]]) #[[ATTR5]] +; CHECK-NEXT: call void @__openmp_mapper_issue_wrapper_test_func.mapper.bb(ptr [[DOTOFFLOAD_BASEPTRS]], ptr [[CALL]], ptr [[DOTOFFLOAD_PTRS]], ptr [[DOTOFFLOAD_SIZES]], i32 [[N]], ptr [[HANDLE]]) +; CHECK-NEXT: br label [[FOR_COND6:%.*]] +; CHECK: for.cond6: +; CHECK-NEXT: [[I5_0:%.*]] = phi i32 [ 0, [[FOR_END]] ], [ [[INC13:%.*]], [[FOR_INC12:%.*]] ] +; CHECK-NEXT: [[CMP7:%.*]] = icmp slt i32 [[I5_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY9:%.*]], label [[FOR_END14:%.*]] +; CHECK: for.body9: +; CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[I5_0]] to i64 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[CALL4]], i64 [[IDXPROM10]] +; CHECK-NEXT: store i32 [[I5_0]], ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: br label [[FOR_INC12]] +; CHECK: for.inc12: +; 
CHECK-NEXT: [[INC13]] = add nsw i32 [[I5_0]], 1 +; CHECK-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: for.end14: +; CHECK-NEXT: br label [[FOR_COND16:%.*]] +; CHECK: for.cond16: +; CHECK-NEXT: [[I15_0:%.*]] = phi i32 [ 0, [[FOR_END14]] ], [ [[INC38:%.*]], [[FOR_INC37:%.*]] ] +; CHECK-NEXT: [[CMP17:%.*]] = icmp slt i32 [[I15_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP17]], label [[FOR_BODY19:%.*]], label [[FOR_END39:%.*]] +; CHECK: for.body19: +; CHECK-NEXT: br label [[FOR_COND20:%.*]] +; CHECK: for.cond20: +; CHECK-NEXT: [[J_0:%.*]] = phi i32 [ 0, [[FOR_BODY19]] ], [ [[INC35:%.*]], [[FOR_INC34:%.*]] ] +; CHECK-NEXT: [[CMP21:%.*]] = icmp slt i32 [[J_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_BODY23:%.*]], label [[FOR_END36:%.*]] +; CHECK: for.body23: +; CHECK-NEXT: br label [[FOR_COND24:%.*]] +; CHECK: for.cond24: +; CHECK-NEXT: [[K_0:%.*]] = phi i32 [ 0, [[FOR_BODY23]] ], [ [[INC32:%.*]], [[FOR_INC31:%.*]] ] +; CHECK-NEXT: [[CMP25:%.*]] = icmp slt i32 [[K_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP25]], label [[FOR_BODY27:%.*]], label [[FOR_END33:%.*]] +; CHECK: for.body27: +; CHECK-NEXT: [[ADD28:%.*]] = add nsw i32 [[I15_0]], [[J_0]] +; CHECK-NEXT: [[ADD29:%.*]] = add nsw i32 [[ADD28]], [[K_0]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @global, align 4 +; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP0]], [[ADD29]] +; CHECK-NEXT: store i32 [[ADD30]], ptr @global, align 4 +; CHECK-NEXT: br label [[FOR_INC31]] +; CHECK: for.inc31: +; CHECK-NEXT: [[INC32]] = add nsw i32 [[K_0]], 1 +; CHECK-NEXT: br label [[FOR_COND24]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end33: +; CHECK-NEXT: br label [[FOR_INC34]] +; CHECK: for.inc34: +; CHECK-NEXT: [[INC35]] = add nsw i32 [[J_0]], 1 +; CHECK-NEXT: br label [[FOR_COND20]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.end36: +; CHECK-NEXT: br label [[FOR_INC37]] +; CHECK: for.inc37: +; CHECK-NEXT: [[INC38]] = add nsw i32 [[I15_0]], 1 +; CHECK-NEXT: br label [[FOR_COND16]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: for.end39: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @global, align 4 +; CHECK-NEXT: [[INC40:%.*]] = add nsw i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[INC40]], ptr @global, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +; CHECK-NEXT: br label [[CODEREPL:%.*]] +; CHECK: codeRepl: +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[N]], 1 +; CHECK-NEXT: [[IDXPROM43:%.*]] = sext i32 [[SUB]] to i64 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[IDXPROM43]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: call void 
@__tgt_target_data_begin_mapper_wait(ptr @[[GLOB1:[0-9]+]], i64 -1, ptr [[HANDLE]]) +; CHECK-NEXT: [[CALL45:%.*]] = call i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @[[GLOB1]], i64 -1, i32 1, ptr [[TMP13]], ptr [[TMP14]], ptr [[TMP15]], ptr @.offload_maptypes, ptr null, ptr null) ; CHECK-NEXT: ret void ; entry: - ; Fallback for offloading function heavyComputation2. + %.offload_baseptrs = alloca [1 x ptr], align 8 + %.offload_ptrs = alloca [1 x ptr], align 8 + %.offload_mappers = alloca [1 x ptr], align 8 + %.offload_sizes = alloca [1 x i64], align 8 + %conv = sext i32 %N to i64 + %mul = mul i64 4, %conv + %call = call noalias ptr @malloc(i64 noundef %mul) #4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %N + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %i.0, 8 + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32, ptr %call, i64 %idxprom + store i32 %add, ptr %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond, !llvm.loop !9 + +for.end: ; preds = %for.cond + %conv2 = sext i32 %N to i64 + %mul3 = mul i64 4, %conv2 + %call4 = call noalias ptr @malloc(i64 noundef %mul3) #4 + br label %for.cond6 + +for.cond6: ; preds = %for.inc12, %for.end + %i5.0 = phi i32 [ 0, %for.end ], [ %inc13, %for.inc12 ] + %cmp7 = icmp slt i32 %i5.0, %N + br i1 %cmp7, label %for.body9, label %for.end14 + +for.body9: ; preds = %for.cond6 + %idxprom10 = sext i32 %i5.0 to i64 + %arrayidx11 = getelementptr inbounds i32, ptr %call4, i64 %idxprom10 + store i32 %i5.0, ptr %arrayidx11, align 4 + br label %for.inc12 + +for.inc12: ; preds = %for.body9 + %inc13 = add nsw i32 %i5.0, 1 + br label %for.cond6, !llvm.loop !11 + +for.end14: ; preds = %for.cond6 + br label %for.cond16 + +for.cond16: ; preds = %for.inc37, %for.end14 + %i15.0 = phi i32 [ 0, %for.end14 ], [ %inc38, %for.inc37 ] + %cmp17 = icmp slt i32 %i15.0, %N + br i1 %cmp17, label %for.body19, label %for.end39 + +for.body19: ; preds = %for.cond16 + br label %for.cond20 + +for.cond20: ; preds = %for.inc34, %for.body19 + %j.0 = phi i32 [ 0, %for.body19 ], [ %inc35, %for.inc34 ] + %cmp21 = icmp slt i32 %j.0, %N + br i1 %cmp21, label %for.body23, label %for.end36 + +for.body23: ; preds = %for.cond20 + br label %for.cond24 + +for.cond24: ; preds = %for.inc31, %for.body23 + %k.0 = phi i32 [ 0, %for.body23 ], [ %inc32, %for.inc31 ] + %cmp25 = icmp slt i32 %k.0, %N + br i1 %cmp25, label %for.body27, label %for.end33 + +for.body27: ; preds = %for.cond24 + %add28 = add nsw i32 %i15.0, %j.0 + %add29 = add nsw i32 %add28, %k.0 + %0 = load i32, ptr @global, align 4 + %add30 = add nsw i32 %0, %add29 + store i32 %add30, ptr @global, align 4 + br label %for.inc31 + +for.inc31: ; preds = %for.body27 + %inc32 = add nsw i32 %k.0, 1 + br label %for.cond24, !llvm.loop !12 + +for.end33: ; preds = %for.cond24 + br label %for.inc34 + +for.inc34: ; preds = %for.end33 + %inc35 = add nsw i32 %j.0, 1 + br label %for.cond20, !llvm.loop !13 + +for.end36: ; preds = %for.cond20 + 
br label %for.inc37 + +for.inc37: ; preds = %for.end36 + %inc38 = add nsw i32 %i15.0, 1 + br label %for.cond16, !llvm.loop !14 + +for.end39: ; preds = %for.cond16 + %1 = load i32, ptr @global, align 4 + %inc40 = add nsw i32 %1, 1 + store i32 %inc40, ptr @global, align 4 + %arrayidx41 = getelementptr inbounds i32, ptr %call, i64 0 + %conv42 = sext i32 %N to i64 + %2 = mul nuw i64 %conv42, 4 + %3 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 + store ptr %call, ptr %3, align 8 + %4 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 + store ptr %arrayidx41, ptr %4, align 8 + %5 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 + store i64 %2, ptr %5, align 8 + %6 = getelementptr inbounds [1 x ptr], ptr %.offload_mappers, i64 0, i64 0 + store ptr null, ptr %6, align 8 + %7 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 + %8 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 + %9 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 + call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %7, ptr %8, ptr %9, ptr @.offload_maptypes, ptr null, ptr null) + %sub = sub nsw i32 %N, 1 + %idxprom43 = sext i32 %sub to i64 + %arrayidx44 = getelementptr inbounds i32, ptr %call, i64 %idxprom43 + %10 = load i32, ptr %arrayidx44, align 4 + %call45 = call i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef %10) + %11 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 + %12 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 + %13 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 + call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %11, ptr %12, ptr %13, ptr @.offload_maptypes, ptr null, ptr null) ret void } -;int heavyComputation3(double* restrict a, unsigned size) { -; int random = rand() % 7; -; -; //#pragma omp target data map(a[0:size], size) -; void* args[2]; -; args[0] = &a; -; args[1] = &size; -; __tgt_target_data_begin(..., args, ...) -; -; #pragma omp target teams -; for (int i = 0; i < size; ++i) { -; a[i] = ++a[i] * 3.141624; -; } -; -; return random; -;} -define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation3 -; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: store i32 [[SIZE]], i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 1 -; CHECK-NEXT: store i64 4, i64* [[TMP10]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[SIZE_CASTED:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS3]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP17]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull [[TMP12]], i8** nonnull [[TMP14]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation3FallBack(i64 [[SIZE_CASTED]], double* 
[[A]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] -; +; Function Attrs: nounwind allocsize(0) +declare noalias ptr @malloc(i64 noundef) #1 +; Function Attrs: nounwind +declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) #2 -entry: - %size.addr = alloca i32, align 4 - %.offload_baseptrs = alloca [2 x i8*], align 8 - %.offload_ptrs = alloca [2 x i8*], align 8 - %.offload_sizes = alloca [2 x i64], align 8 - %.offload_baseptrs2 = alloca [2 x i8*], align 8 - %.offload_ptrs3 = alloca [2 x i8*], align 8 - store i32 %size, i32* %size.addr, align 4 - - ; FIXME: call to @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @0, ...) should be moved here. - %call = tail call i32 (...) @rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [2 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 - %7 = bitcast i8** %6 to i32** - store i32* %size.addr, i32** %7, align 8 - %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 - %9 = bitcast i8** %8 to i32** - store i32* %size.addr, i32** %9, align 8 - %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 - store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - - %11 = load i32, i32* %size.addr, align 4 - %size.casted = zext i32 %11 to i64 - %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 - %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64* - store i64 %size.casted, i64* %13, align 8 - %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0 - %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64* - store i64 %size.casted, i64* %15, align 8 - %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1 - %17 = bitcast i8** %16 to double** - store double* %a, double** %17, align 8 - %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 - %19 = bitcast i8** %18 to double** - store double* %a, double** %19, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
- %20 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %20, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation3FallBack(i64 %size.casted, double* %a) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %rem = srem i32 %call, 7 - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem -} +declare i32 @printf(ptr noundef, ...) #3 -define internal void @heavyComputation3FallBack(i64 %size, double* %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation3FallBack -; CHECK-SAME: (i64 [[SIZE:%.*]], double* [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -entry: - ; Fallback for offloading function heavyComputation3. - ret void -} +; Function Attrs: nounwind +declare void @__tgt_target_data_end_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) #2 -;int dataTransferOnly1(double* restrict a, unsigned size) { -; // Random computation. -; int random = rand(); -; -; //#pragma omp target data map(to:a[0:size]) -; void* args[1]; -; args[0] = &a; -; __tgt_target_data_begin(..., args, ...) -; -; // Random computation. -; random %= size; -; return random; -;} -define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@dataTransferOnly1 -; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @main(i32 noundef %argc, ptr noundef %argv) #0 { +; CHECK-LABEL: @main( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** [[TMP1]], i8** [[TMP3]], i64* [[TMP5]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[REM:%.*]] = urem i32 [[CALL]], [[SIZE]] -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr, ...) @printf(ptr noundef @.str.1) +; CHECK-NEXT: call void @test_func(i32 noundef 10) +; CHECK-NEXT: ret i32 0 ; - - - - - - entry: - %.offload_baseptrs = alloca [1 x i8*], align 8 - %.offload_ptrs = alloca [1 x i8*], align 8 - %.offload_sizes = alloca [1 x i64], align 8 - - ; FIXME: call to @__tgt_target_data_begin_issue_mapper(...) should be moved here. - %call = tail call i32 (...) @rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [1 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [1 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null) - - %rem = urem i32 %call, %size - - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem + %call = call i32 (ptr, ...) 
@printf(ptr noundef @.str.1) + call void @test_func(i32 noundef 10) + ret i32 0 } -declare void @__tgt_target_data_begin_mapper(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**) -declare i32 @__tgt_target_teams_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**, i32, i32) -declare void @__tgt_target_data_end_mapper(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**) - -declare dso_local i32 @rand(...) - -; CHECK: declare void @__tgt_target_data_begin_mapper_issue(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**, %struct.__tgt_async_info*) -; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info*) - -!llvm.module.flags = !{!0} - -!0 = !{i32 7, !"openmp", i32 50} +attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nounwind allocsize(0) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { nounwind } +attributes #3 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nounwind allocsize(0) } + +!llvm.module.flags = !{!1, !2, !3, !4, !5, !6} +!llvm.ident = !{!7} +!llvm.embedded.objects = !{!8} + +!0 = !{} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"openmp", i32 50} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git cf1d9a1fdca258ab56f3060dfa4a303b8127350e)"} +!8 = !{ptr @llvm.embedded.object, !".llvm.offloading"} +!9 = distinct !{!9, !10} +!10 = !{!"llvm.loop.mustprogress"} +!11 = distinct !{!11, !10} +!12 = distinct !{!12, !10} +!13 = distinct !{!13, !10} +!14 = distinct !{!14, !10}
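; The block below is an illustrative sketch only (it is not verified by
; FileCheck) of what -openmp-hide-memory-transfer-latency is expected to do to
; the __tgt_target_data_begin_mapper call in @test_func above. The handle and
; the wrapper name are taken from the CHECK lines; everything else, including
; the "..." placeholders, is schematic.
;
;   %handle = alloca %struct.__tgt_async_info
;   ...
;   ; The offload-array setup and the "issue" call are outlined into a wrapper
;   ; function, and the call to that wrapper is hoisted as far up as alias,
;   ; dominator, post-dominator and loop information allow.
;   call void @__openmp_mapper_issue_wrapper_test_func.mapper.bb(..., ptr %handle)
;   ...   ; computation that does not touch the mapped buffer
;   ; The "wait" is sunk within its block to just before the first instruction
;   ; that may access the mapped buffer.
;   call void @__tgt_target_data_begin_mapper_wait(ptr @1, i64 -1, ptr %handle)
;   ...   ; first use of the mapped buffer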