Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -49,6 +50,7 @@
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
 #include
@@ -816,6 +818,9 @@
   // TODO: This should be folded into buildCustomStateMachine.
   Changed |= rewriteDeviceCodeStateMachine();
 
+  if (HideMemoryTransferLatency)
+    Changed |= hideMemTransfersLatency();
+
   if (remarksEnabled())
     analysisGlobalization();
 
@@ -833,8 +838,6 @@
 
     Changed |= deleteParallelRegions();
 
-    if (HideMemoryTransferLatency)
-      Changed |= hideMemTransfersLatency();
     Changed |= deduplicateRuntimeCalls();
     if (EnableParallelRegionMerging) {
       if (mergeParallelRegions()) {
@@ -1375,40 +1378,315 @@
     return Changed;
   }
 
+  bool
+  splitMapperToIssueAndWait(CallInst *RuntimeCall,
+                            std::pair<CallInst *, CallInst *> &IssueWaitPair) {
+    auto &IRBuilder = OMPInfoCache.OMPBuilder;
+    Function *F = RuntimeCall->getCaller();
+    const DataLayout &DL = M.getDataLayout();
+    Instruction *FirstInst = &(F->getEntryBlock().front());
+    AllocaInst *Handle = new AllocaInst(
+        IRBuilder.AsyncInfo, DL.getAllocaAddrSpace(), "handle", FirstInst);
+
+    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_issue);
+
+    // Change the RuntimeCall call site to its asynchronous version.
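+    // The "issue" form takes the same operands as the original
+    // __tgt_target_data_begin_mapper call plus a trailing pointer to the
+    // stack-allocated __tgt_async_info handle, and the matching "wait" takes
+    // the ident, the device ID, and that handle. Roughly (call shapes as
+    // exercised by hide_mem_transfer_latency.ll below):
+    //   call void @__tgt_target_data_begin_mapper_issue(ptr @1, i64 -1, ...,
+    //                                                   ptr %handle)
+    //   call void @__tgt_target_data_begin_mapper_wait(ptr @1, i64 -1,
+    //                                                  ptr %handle)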
+    SmallVector<Value *, 16> Args;
+    for (auto &Arg : RuntimeCall->args())
+      Args.push_back(Arg.get());
+    Args.push_back(Handle);
+
+    auto IssueCall =
+        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", RuntimeCall);
+    assert(IssueCall && "Incomplete splitting");
+    IssueWaitPair.first = IssueCall;
+
+    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_wait);
+
+    Value *WaitParams[3] = {
+        RuntimeCall->getArgOperand(0),
+        RuntimeCall->getArgOperand(OffloadArray::DeviceIDArgNum), Handle};
+    auto WaitCall = CallInst::Create(WaitDecl, WaitParams, "", RuntimeCall);
+    assert(WaitCall && "Incomplete splitting");
+    IssueWaitPair.second = WaitCall;
+
+    RuntimeCall->eraseFromParent();
+
+    return true;
+  }
+
+  bool getUseTreeDuplicated(
+      SmallVector<std::pair<Instruction *, Instruction *>> &UseTreeDuplicated,
+      Instruction *I, Instruction *MoveI) {
+    UseTreeDuplicated.push_back({I, nullptr});
+
+    for (unsigned int i = 0; i < UseTreeDuplicated.size(); ++i)
+      for (User *U :
+           dyn_cast<Instruction>(UseTreeDuplicated[i].first)->users()) {
+        if (dyn_cast<CallInst>(U))
+          continue;
+        assert((dyn_cast<StoreInst>(U) || dyn_cast<GetElementPtrInst>(U) ||
+                dyn_cast<BitCastInst>(U)) &&
+               "Problem in data mapping");
+        auto *UInstr = cast<Instruction>(U);
+        if (isSafeToSpeculativelyExecute(UInstr)) // Duplicate it.
+          UseTreeDuplicated.push_back({UInstr, UInstr->clone()});
+        else
+          UseTreeDuplicated.push_back({UInstr, nullptr});
+      }
+
+    int VecSize = UseTreeDuplicated.size();
+    for (int i = 1; i < VecSize; i++) {
+      // Insert the duplicate if it exists.
+      if (UseTreeDuplicated[i].second)
+        UseTreeDuplicated[i].second->insertBefore(UseTreeDuplicated[i].first);
+      // Move the instruction to the right BB.
+      UseTreeDuplicated[i].first->moveBefore(MoveI);
+    }
+
+    return true;
+  }
+
+  Function *outlineMapperIssueRT(CallInst &RuntimeCall) {
+    BasicBlock *RuntimeCallBB = RuntimeCall.getParent();
+    // i8** %offload_baseptrs.
+    Value *BasePtrsArg =
+        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
+    // i8** %offload_ptrs.
+    Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
+    // i8** %offload_sizes.
+    Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
+
+    auto *BasePtrsArray =
+        dyn_cast<AllocaInst>(getUnderlyingObject(BasePtrsArg));
+    auto *PtrsArray = dyn_cast<AllocaInst>(getUnderlyingObject(PtrsArg));
+    auto *SizesArray = dyn_cast<AllocaInst>(getUnderlyingObject(SizesArg));
+    if (!BasePtrsArray || !PtrsArray)
+      return nullptr;
+
+    // Split off the block that is going to be outlined.
+    BasicBlock *MapperBB =
+        RuntimeCallBB->splitBasicBlock(&RuntimeCall, "mapper.bb");
+    BasicBlock *NextBB =
+        MapperBB->splitBasicBlock(RuntimeCall.getNextNode(), "next.bb");
+
+    SmallVector<std::pair<Instruction *, Instruction *>> BasePtrsUses;
+    SmallVector<std::pair<Instruction *, Instruction *>> PtrsUses;
+
+    getUseTreeDuplicated(BasePtrsUses, BasePtrsArray, (&RuntimeCall));
+    getUseTreeDuplicated(PtrsUses, PtrsArray, (&RuntimeCall));
+
+    for (auto Bpu : BasePtrsUses)
+      if (Bpu.second)
+        Bpu.first->replaceUsesOutsideBlock(Bpu.second, MapperBB);
+    for (auto Pu : PtrsUses)
+      if (Pu.second)
+        Pu.first->replaceUsesOutsideBlock(Pu.second, MapperBB);
+
+    // For cases with constant array size.
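+    // When every mapped size is a compile-time constant, the front end
+    // typically emits the sizes as a constant global (for instance
+    // "@.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8]"
+    // in the old test below), so getUnderlyingObject does not yield an
+    // AllocaInst and SizesArray stays null. When the sizes are only known at
+    // run time they live in a stack array such as %.offload_sizes in test1,
+    // and that array's use tree has to be duplicated as well.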
+    if (SizesArray) {
+      SmallVector<std::pair<Instruction *, Instruction *>> SizesArrayUses;
+      getUseTreeDuplicated(SizesArrayUses, SizesArray, (&RuntimeCall));
+      for (auto Sau : SizesArrayUses)
+        if (Sau.second)
+          Sau.first->replaceUsesOutsideBlock(Sau.second, MapperBB);
+    }
+
+    SmallVector<BasicBlock *> ExtractBB{MapperBB};
+    CodeExtractor CE(ExtractBB);
+    CodeExtractorAnalysisCache CEAC(*(MapperBB->getParent()));
+    Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
+    if (!OutlinedFunc)
+      return nullptr;
+
+    OutlinedFunc->setName("__openmp_mapper_issue_wrapper_" +
+                          MapperBB->getParent()->getName());
+
+    MergeBlockIntoPredecessor(NextBB);
+    MergeBlockIntoPredecessor(RuntimeCall.getParent());
+
+    // Annotate the outlined function.
+    OutlinedFunc->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
+    OutlinedFunc->addFnAttr(Attribute::AlwaysInline);
+    int i = 0;
+    for (auto &A : OutlinedFunc->args()) {
+      if (A.getType()->isPointerTy())
+        OutlinedFunc->addParamAttr(i, Attribute::NoCapture);
+      i++;
+    }
+    return OutlinedFunc;
+  }
+
+  // Returns true if \p CI can move before or after \p I.
+  bool canMoveThrough(CallInst *CI, Instruction *I, AliasAnalysis &AA) {
+    if (!(I->mayHaveSideEffects()))
+      return true;
+
+    auto MR = AA.getModRefInfo(I, CI);
+    return isNoModRef(MR);
+  }
+
+  bool canMoveThroughBlock(CallInst *CI, BasicBlock *B, AliasAnalysis &AA) {
+    for (auto &I : *B)
+      if (!canMoveThrough(CI, &I, AA))
+        return false;
+    return true;
+  }
+
+  // This function gets the current BB of the 'issue' call and returns the
+  // next BB the 'issue' call can safely move to.
+  BasicBlock *findNextBBToCheckForMoving(CallInst *CI, BasicBlock *B,
+                                         AliasAnalysis &AA, DominatorTree &DT,
+                                         LoopInfo &LI) {
+    Loop *BLoop = LI.getLoopFor(B);
+    BasicBlock *DestBlock = B;
+    BasicBlock *DomBlock;
+
+    while (true) {
+      auto Dom = DT.getNode(DestBlock)->getIDom();
+      if (!Dom)
+        return B;
+      DomBlock = Dom->getBlock();
+      Loop *DomLoop = LI.getLoopFor(DomBlock);
+      if (BLoop == DomLoop)
+        break;
+      DestBlock = DomBlock;
+    }
+
+    SetVector<BasicBlock *> PredecessorBB;
+    PredecessorBB.insert(B);
+
+    for (unsigned int i = 0; i < PredecessorBB.size(); ++i)
+      for (BasicBlock *S : predecessors(PredecessorBB[i])) {
+        if (S == DomBlock)
+          continue;
+        if (!canMoveThroughBlock(CI, S, AA))
+          return B;
+        PredecessorBB.insert(S);
+      }
+
+    return DomBlock;
+  }
+
+  bool moveWaitRTCInOrigBB(CallInst *IssueWrapperCall, CallInst *RTCallWait,
+                           AliasAnalysis &AA) {
+    Instruction *I = RTCallWait;
+    Instruction *WaitMovePoint;
+
+    while ((I = I->getNextNonDebugInstruction()))
+      if (!canMoveThrough(IssueWrapperCall, I, AA)) {
+        WaitMovePoint = I;
+        break;
+      }
+    if (!I)
+      WaitMovePoint = RTCallWait->getParent()->getTerminator();
+    RTCallWait->moveBefore(WaitMovePoint);
+    return true;
+  }
+
+  bool moveIssueRTCInBB(CallInst *IssueWrapperCall, BasicBlock *CurrentBB,
+                        AliasAnalysis &AA) {
+    Instruction *IssueMovePoint;
+    bool IsOrigBB = (CurrentBB == IssueWrapperCall->getParent());
+    Instruction *I;
+    if (IsOrigBB)
+      I = IssueWrapperCall;
+    else
+      I = CurrentBB->getTerminator();
+
+    while ((I = I->getPrevNonDebugInstruction()))
+      if (!canMoveThrough(IssueWrapperCall, I, AA)) {
+        IssueMovePoint = I;
+        break;
+      }
+
+    // The call can move beyond its original BB; for the time being, move it
+    // to the beginning of the BB.
+    if (!I && IsOrigBB) {
+      IssueWrapperCall->moveBefore(CurrentBB->getFirstNonPHI());
+      return false;
+    }
+    // Insert the issue call at the very beginning of the BB.
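+    // If no blocking instruction was found, nothing in CurrentBB interferes
+    // with the issue call (every instruction is either free of side effects
+    // or proven NoModRef by canMoveThrough), so the earliest position in this
+    // block is right after the PHIs. The overall effect can be seen in test1
+    // of hide_mem_transfer_latency.ll below, where the issue call, together
+    // with the duplicated set-up of the offload arrays, ends up above the
+    // independent update of @global while the wait stays after it.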
+    if (!I) {
+      IssueWrapperCall->moveBefore(CurrentBB->getFirstNonPHI());
+      return true;
+    }
+    IssueWrapperCall->moveAfter(IssueMovePoint);
+    return true;
+  }
+
+  bool moveIssueAndWaitRTC(CallInst *IssueWrapperCall, CallInst *RTCallWait,
+                           AliasAnalysis &AA, LoopInfo &LI, DominatorTree &DT) {
+    BasicBlock *IssueBB = IssueWrapperCall->getParent();
+    moveWaitRTCInOrigBB(IssueWrapperCall, RTCallWait, AA);
+    // Move the issue call within its original BB.
+    bool IssueMovedInOrigBB = moveIssueRTCInBB(IssueWrapperCall, IssueBB, AA);
+    // If it stayed in its original BB, it cannot move beyond it; we are done.
+    if (IssueMovedInOrigBB)
+      return true;
+
+    BasicBlock *CurrentBB = IssueBB;
+    BasicBlock *NextBB;
+    while ((NextBB = findNextBBToCheckForMoving(IssueWrapperCall, CurrentBB, AA,
+                                                DT, LI))) {
+      // It cannot move any further.
+      if (NextBB == CurrentBB)
+        break;
+      CurrentBB = NextBB;
+    }
+    moveIssueRTCInBB(IssueWrapperCall, CurrentBB, AA);
+
+    return true;
+  }
+
   /// Tries to hide the latency of runtime calls that involve host to
   /// device memory transfers by splitting them into their "issue" and "wait"
   /// versions. The "issue" is moved upwards as much as possible. The "wait" is
-  /// moved downards as much as possible. The "issue" issues the memory transfer
-  /// asynchronously, returning a handle. The "wait" waits in the returned
-  /// handle for the memory transfer to finish.
-  bool hideMemTransfersLatency() {
-    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
+  /// moved downwards as much as possible in its original BB. The "issue" issues
+  /// the memory transfer asynchronously, returning a handle. The "wait" waits
+  /// on the returned handle for the memory transfer to finish.
+  bool hideMemTransfersLatencyHelper(Function &F) {
+
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
     bool Changed = false;
-    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
+    auto AsyncMemTransfers = [&](Use &U, Function &Decl) {
      auto *RTCall = getCallIfRegularCall(U, &RFI);
      if (!RTCall)
        return false;
+      Changed = true;
 
-      OffloadArray OffloadArrays[3];
-      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
-        return false;
+      std::pair<CallInst *, CallInst *> IssueWaitPair;
+      splitMapperToIssueAndWait(RTCall, IssueWaitPair);
+      CallInst *RTCallIssue = IssueWaitPair.first;
+      CallInst *RTCallWait = IssueWaitPair.second;
+      Function *IssueWrapper = outlineMapperIssueRT(*RTCallIssue);
+      assert(IssueWrapper && "Incomplete outlining in latency hiding");
+
+      // There is a one-to-one relation between the runtime call and
+      // IssueWrapper.
+      CallInst *IssueWrapperCall =
+          cast<CallInst>(*(IssueWrapper->users().begin()));
+      assert(IssueWrapperCall && "Incomplete outlining in latency hiding");
 
-      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
+      AliasAnalysis *AA =
+          OMPInfoCache.getAnalysisResultForFunction<AAManager>(F);
+      LoopInfo *LI = OMPInfoCache.getAnalysisResultForFunction<LoopAnalysis>(F);
+      DominatorTree *DT =
+          OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
 
-      // TODO: Check if can be moved upwards.
-      bool WasSplit = false;
-      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
-      if (WaitMovementPoint)
-        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
+      moveIssueAndWaitRTC(IssueWrapperCall, RTCallWait, *AA, *LI, *DT);
 
-      Changed |= WasSplit;
-      return WasSplit;
+      return true;
    };
-    RFI.foreachUse(SCC, SplitMemTransfers);
+    RFI.foreachUse(AsyncMemTransfers, &F);
 
     return Changed;
   }
 
+  bool hideMemTransfersLatency() {
+    bool Changed = false;
+    for (auto &F : M)
+      Changed |= hideMemTransfersLatencyHelper(F);
+    return Changed;
+  }
+
   /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels.
   /// TODO: Make this an AA and expand it to work across blocks and functions.
   bool eliminateBarriers() {
@@ -1719,81 +1997,6 @@
     LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
   }
 
-  /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
-  /// moved. Returns nullptr if the movement is not possible, or not worth it.
-  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
-    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
-    //        Make it traverse the CFG.
-
-    Instruction *CurrentI = &RuntimeCall;
-    bool IsWorthIt = false;
-    while ((CurrentI = CurrentI->getNextNode())) {
-
-      // TODO: Once we detect the regions to be offloaded we should use the
-      //       alias analysis manager to check if CurrentI may modify one of
-      //       the offloaded regions.
-      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
-        if (IsWorthIt)
-          return CurrentI;
-
-        return nullptr;
-      }
-
-      // FIXME: For now if we move it over anything without side effect
-      //        is worth it.
-      IsWorthIt = true;
-    }
-
-    // Return end of BasicBlock.
-    return RuntimeCall.getParent()->getTerminator();
-  }
-
-  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
-  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
-                               Instruction &WaitMovementPoint) {
-    // Create stack allocated handle (__tgt_async_info) at the beginning of the
-    // function. Used for storing information of the async transfer, allowing to
-    // wait on it later.
-    auto &IRBuilder = OMPInfoCache.OMPBuilder;
-    auto *F = RuntimeCall.getCaller();
-    Instruction *FirstInst = &(F->getEntryBlock().front());
-    AllocaInst *Handle = new AllocaInst(
-        IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);
-
-    // Add "issue" runtime call declaration:
-    // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
-    //   i8**, i8**, i64*, i64*)
-    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
-        M, OMPRTL___tgt_target_data_begin_mapper_issue);
-
-    // Change RuntimeCall call site for its asynchronous version.
-    SmallVector<Value *, 16> Args;
-    for (auto &Arg : RuntimeCall.args())
-      Args.push_back(Arg.get());
-    Args.push_back(Handle);
-
-    CallInst *IssueCallsite =
-        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
-    OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
-    RuntimeCall.eraseFromParent();
-
-    // Add "wait" runtime call declaration:
-    // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
-    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
-        M, OMPRTL___tgt_target_data_begin_mapper_wait);
-
-    Value *WaitParams[2] = {
-        IssueCallsite->getArgOperand(
-            OffloadArray::DeviceIDArgNum), // device_id.
-        Handle                             // handle to wait on.
- }; - CallInst *WaitCallsite = CallInst::Create( - WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); - OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite); - - return true; - } - static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, bool GlobalOnly, bool &SingleChoice) { if (CurrentIdent == NextIdent) Index: llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll =================================================================== --- llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll +++ llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll @@ -1,528 +1,417 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -S -passes=openmp-opt-cgscc -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - -; CHECK: %struct.__tgt_async_info = type { i8* } - -%struct.ident_t = type { i32, i32, i32, i32, i8* } -%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } - -@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] -@.__omp_offloading_heavyComputation1.region_id = weak constant i8 0 -@.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8] -@.offload_maptypes.2 = private unnamed_addr constant [1 x i64] [i64 800] - -@.__omp_offloading_heavyComputation2.region_id = weak constant i8 0 -@.offload_maptypes.3 = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -O3 -openmp-hide-memory-transfer-latency < %s | FileCheck %s -@.__omp_offloading_heavyComputation3.region_id = weak constant i8 0 -@.offload_sizes.2 = private unnamed_addr constant [2 x i64] [i64 4, i64 0] -@.offload_maptypes.4 = private unnamed_addr constant [2 x i64] [i64 800, i64 544] +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" -@.offload_maptypes.5 = private unnamed_addr constant [1 x i64] [i64 33] +%struct.ident_t = type { i32, i32, i32, i32, ptr } -@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8 -@.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@global = dso_local local_unnamed_addr global i32 0, align 4 +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 1] +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 +@llvm.embedded.object = private constant [1 x i8] c"0", section ".llvm.offloading", align 8, !exclude !0 +@llvm.compiler.used = appending global [1 x ptr] [ptr @llvm.embedded.object], section "llvm.metadata" -;double heavyComputation1() { -; double a = rand() % 777; -; double random = rand(); -; -; //#pragma omp target data map(a) -; void* args[1]; -; args[0] = &a; -; __tgt_target_data_begin(..., args, ...) 
-; -; #pragma omp target teams -; for (int i = 0; i < 1000; ++i) { -; a *= i*i / 2; -; } -; -; return random + a; -;} -define dso_local double @heavyComputation1() { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation1() { +; Function Attrs: nounwind uwtable +define dso_local void @test1(i32 noundef %N) local_unnamed_addr #0 { +; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 -; CHECK-NEXT: [[A:%.*]] = alloca double, align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to i8* -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 777 -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to double -; CHECK-NEXT: store double [[CONV]], double* [[A]], align 8 -; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @[[GLOB0:[0-9]+]], i64 -1, i32 1, i8** [[TMP1]], i8** [[TMP3]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to i64* -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i64 0, i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS4]] to i64* -; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS5]], i64 0, i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS5]] to i64* -; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull [[TMP7]], i8** nonnull [[TMP9]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP11]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation1FallBack(i64 [[TMP6]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[CONV2:%.*]] = sitofp i32 [[CALL1]] to 
double -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[A]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP12]], [[CONV2]] -; CHECK-NEXT: ret double [[ADD]] -; - - - - - - -entry: - %a = alloca double, align 8 - %.offload_baseptrs = alloca [1 x i8*], align 8 - %.offload_ptrs = alloca [1 x i8*], align 8 - %.offload_baseptrs4 = alloca [1 x i8*], align 8 - %.offload_ptrs5 = alloca [1 x i8*], align 8 - - %0 = bitcast double* %a to i8* - %call = tail call i32 (...) @rand() - %rem = srem i32 %call, 777 - %conv = sitofp i32 %rem to double - store double %conv, double* %a, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @0, ...) should be moved here. - %call1 = tail call i32 (...) @rand() - - %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [1 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [1 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) - - %5 = bitcast double* %a to i64* - %6 = load i64, i64* %5, align 8 - %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0 - %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64* - store i64 %6, i64* %8, align 8 - %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0 - %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64* - store i64 %6, i64* %10, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
- %11 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %11, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation1FallBack(i64 %6) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %conv2 = sitofp i32 %call1 to double - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) - %12 = load double, double* %a, align 8 - %add = fadd double %12, %conv2 - ret double %add -} - -define internal void @heavyComputation1FallBack(i64 %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation1FallBack -; CHECK-SAME: (i64 [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -entry: - ; Fallback for offloading function heavyComputation1. - ret void -} - -;int heavyComputation2(double* a, unsigned size) { -; int random = rand() % 7; -; -; //#pragma omp target data map(a[0:size], size) -; void* args[2]; -; args[0] = &a; -; args[1] = &size; -; __tgt_target_data_begin(..., args, ...) -; -; #pragma omp target teams -; for (int i = 0; i < size; ++i) { -; a[i] = ++a[i] * 3.141624; -; } -; -; return random; -;} -define dso_local i32 @heavyComputation2(double* %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation2 -; CHECK-SAME: (double* [[A:%.*]], i32 [[SIZE:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: store i32 [[SIZE]], i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 1 -; CHECK-NEXT: store i64 4, i64* [[TMP10]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[SIZE_CASTED:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS3]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP17]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull [[TMP12]], i8** nonnull [[TMP14]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation2FallBack(i64 [[SIZE_CASTED]], double* 
[[A]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] -; - - -entry: - %size.addr = alloca i32, align 4 - %.offload_baseptrs = alloca [2 x i8*], align 8 - %.offload_ptrs = alloca [2 x i8*], align 8 - %.offload_sizes = alloca [2 x i64], align 8 - %.offload_baseptrs2 = alloca [2 x i8*], align 8 - %.offload_ptrs3 = alloca [2 x i8*], align 8 - - store i32 %size, i32* %size.addr, align 4 - %call = tail call i32 (...) @rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [2 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 - %7 = bitcast i8** %6 to i32** - store i32* %size.addr, i32** %7, align 8 - %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 - %9 = bitcast i8** %8 to i32** - store i32* %size.addr, i32** %9, align 8 - %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 - store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - - %11 = load i32, i32* %size.addr, align 4 - %size.casted = zext i32 %11 to i64 - %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 - %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64* - store i64 %size.casted, i64* %13, align 8 - %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0 - %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64* - store i64 %size.casted, i64* %15, align 8 - %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1 - %17 = bitcast i8** %16 to double** - store double* %a, double** %17, align 8 - %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 - %19 = bitcast i8** %18 to double** - store double* %a, double** %19, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
- %20 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %20, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation2FallBack(i64 %size.casted, double* %a) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %rem = srem i32 %call, 7 - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem -} - -define internal void @heavyComputation2FallBack(i64 %size, double* %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation2FallBack -; CHECK-SAME: (i64 [[SIZE:%.*]], double* [[A:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[CONV]], 2 +; CHECK-NEXT: [[CALL:%.*]] = tail call noalias ptr @malloc(i64 noundef [[MUL]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER1:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: for.body.preheader1: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[N_VEC:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER1]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: store ptr [[CALL]], ptr [[DOTOFFLOAD_BASEPTRS]], align 8 +; CHECK-NEXT: store ptr [[CALL]], ptr [[DOTOFFLOAD_PTRS]], align 8 +; CHECK-NEXT: store i64 [[MUL]], ptr [[DOTOFFLOAD_SIZES]], align 8 +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(ptr nonnull @[[GLOB1:[0-9]+]], i64 -1, i32 
1, ptr nonnull [[DOTOFFLOAD_BASEPTRS]], ptr nonnull [[DOTOFFLOAD_PTRS]], ptr nonnull [[DOTOFFLOAD_SIZES]], ptr nonnull @.offload_maptypes, ptr null, ptr null, ptr nonnull [[HANDLE]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @global, align 4, !tbaa [[TBAA8]] +; CHECK-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP2]], 1 +; CHECK-NEXT: store i32 [[INC2]], ptr @global, align 4, !tbaa [[TBAA8]] +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(ptr nonnull @[[GLOB1]], i64 -1, ptr nonnull [[HANDLE]]) #[[ATTR2]] +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @[[GLOB1]], i64 -1, i32 1, ptr nonnull [[DOTOFFLOAD_BASEPTRS]], ptr nonnull [[DOTOFFLOAD_PTRS]], ptr nonnull [[DOTOFFLOAD_SIZES]], ptr nonnull @.offload_maptypes, ptr null, ptr null) ; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER1]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; entry: - ; Fallback for offloading function heavyComputation2. + %.offload_baseptrs = alloca [1 x ptr], align 8 + %.offload_ptrs = alloca [1 x ptr], align 8 + %.offload_sizes = alloca [1 x i64], align 8 + %conv = sext i32 %N to i64 + %mul = shl nsw i64 %conv, 2 + %call = tail call noalias ptr @malloc(i64 noundef %mul) #3 + %cmp8 = icmp sgt i32 %N, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + %min.iters.check = icmp ult i32 %N, 4 + br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 4294967292 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %call, i64 %index + store <4 x i32> %vec.ind, ptr %0, align 4, !tbaa !8 + %index.next = add nuw i64 %index, 4 + %vec.ind.next = add <4 x i32> %vec.ind, + %1 = icmp eq i64 %index.next, %n.vec + br i1 %1, label %middle.block, label %vector.body, !llvm.loop !12 + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11 + +for.body.preheader11: ; preds = %for.body.preheader, %middle.block + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + %2 = load i32, ptr @global, align 4, !tbaa !8 + %inc2 = add nsw i32 %2, 1 + store i32 %inc2, ptr @global, align 4, !tbaa !8 + store ptr %call, ptr %.offload_baseptrs, align 8 + store ptr %call, ptr %.offload_ptrs, align 8 + store i64 %mul, ptr %.offload_sizes, align 8 + call void @__tgt_target_data_begin_mapper(ptr nonnull @1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull 
@.offload_maptypes, ptr null, ptr null) + call void @__tgt_target_data_end_mapper(ptr nonnull @1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes, ptr null, ptr null) ret void -} -;int heavyComputation3(double* restrict a, unsigned size) { -; int random = rand() % 7; -; -; //#pragma omp target data map(a[0:size], size) -; void* args[2]; -; args[0] = &a; -; args[1] = &size; -; __tgt_target_data_begin(..., args, ...) -; -; #pragma omp target teams -; for (int i = 0; i < size; ++i) { -; a[i] = ++a[i] * 3.141624; -; } -; -; return random; -;} -define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation3 -; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: store i32 [[SIZE]], i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 1 -; CHECK-NEXT: store i64 4, i64* [[TMP10]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[SIZE_CASTED:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* 
[[DOTOFFLOAD_PTRS3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS3]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP17]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull [[TMP12]], i8** nonnull [[TMP14]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation3FallBack(i64 [[SIZE_CASTED]], double* [[A]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] -; - - -entry: - %size.addr = alloca i32, align 4 - %.offload_baseptrs = alloca [2 x i8*], align 8 - %.offload_ptrs = alloca [2 x i8*], align 8 - %.offload_sizes = alloca [2 x i64], align 8 - %.offload_baseptrs2 = alloca [2 x i8*], align 8 - %.offload_ptrs3 = alloca [2 x i8*], align 8 - store i32 %size, i32* %size.addr, align 4 - - ; FIXME: call to @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @0, ...) should be moved here. - %call = tail call i32 (...) 
@rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [2 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 - %7 = bitcast i8** %6 to i32** - store i32* %size.addr, i32** %7, align 8 - %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 - %9 = bitcast i8** %8 to i32** - store i32* %size.addr, i32** %9, align 8 - %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 - store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - - %11 = load i32, i32* %size.addr, align 4 - %size.casted = zext i32 %11 to i64 - %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 - %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64* - store i64 %size.casted, i64* %13, align 8 - %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0 - %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64* - store i64 %size.casted, i64* %15, align 8 - %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1 - %17 = bitcast i8** %16 to double** - store double* %a, double** %17, align 8 - %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 - %19 = bitcast i8** %18 to double** - store double* %a, double** %19, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
- %20 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %20, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation3FallBack(i64 %size.casted, double* %a) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %rem = srem i32 %call, 7 - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem +for.body: ; preds = %for.body.preheader11, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader11 ] + %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv + %3 = trunc i64 %indvars.iv to i32 + store i32 %3, ptr %arrayidx, align 4, !tbaa !8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !16 } -define internal void @heavyComputation3FallBack(i64 %size, double* %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation3FallBack -; CHECK-SAME: (i64 [[SIZE:%.*]], double* [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -entry: - ; Fallback for offloading function heavyComputation3. - ret void -} -;int dataTransferOnly1(double* restrict a, unsigned size) { -; // Random computation. -; int random = rand(); -; -; //#pragma omp target data map(to:a[0:size]) -; void* args[1]; -; args[0] = &a; -; __tgt_target_data_begin(..., args, ...) -; -; // Random computation. -; random %= size; -; return random; -;} -define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@dataTransferOnly1 -; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { +define dso_local void @test2(i32 noundef %N) local_unnamed_addr #0 { +; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 ; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@rand()
-; CHECK-NEXT:    [[CONV:%.*]] = zext i32 [[SIZE]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double**
-; CHECK-NEXT:    store double* [[A]], double** [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS]] to double**
-; CHECK-NEXT:    store double* [[A]], double** [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0
-; CHECK-NEXT:    store i64 [[TMP0]], i64* [[TMP5]], align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** [[TMP1]], i8** [[TMP3]], i64* [[TMP5]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null, %struct.__tgt_async_info* [[HANDLE]])
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[CALL]], [[SIZE]]
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* [[HANDLE]])
-; CHECK-NEXT:    call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null)
-; CHECK-NEXT:    ret i32 [[REM]]
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[CONV]], 2
+; CHECK-NEXT:    [[CALL:%.*]] = tail call noalias ptr @malloc(i64 noundef [[MUL]]) #[[ATTR3]]
+; CHECK-NEXT:    [[CMP18:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP18]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP6:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER2:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       for.body.preheader2:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ [[N_VEC:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDEX]]
+; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND8_PREHEADER_US_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER2]]
+; CHECK:       for.cond8.preheader.us.us.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i32 [[N]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i33
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i32 [[N]], -2
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i33
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i33 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr i33 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i33 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[N]], 1001
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP8]], 1000
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[N]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_COND8_PREHEADER_US_US_PREHEADER1:%.*]], label [[VECTOR_PH45:%.*]]
+; CHECK:       vector.ph45:
+; CHECK-NEXT:    [[N_VEC47:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP16]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY50:%.*]]
+; CHECK:       vector.body50:
+; CHECK-NEXT:    [[INDEX51:%.*]] = phi i64 [ 0, [[VECTOR_PH45]] ], [ [[INDEX_NEXT52:%.*]], [[VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDEX51]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    store <4 x i32> [[TMP18]], ptr [[TMP17]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INDEX_NEXT52]] = add nuw nsw i64 [[INDEX51]], 4
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT52]], [[N_VEC47]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK42:%.*]], label [[VECTOR_BODY50]], !llvm.loop [[LOOP16]]
+; CHECK:       middle.block42:
+; CHECK-NEXT:    [[CMP_N49:%.*]] = icmp eq i64 [[N_VEC47]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[CMP_N49]], label [[FOR_COND_CLEANUP6]], label [[FOR_COND8_PREHEADER_US_US_PREHEADER1]]
+; CHECK:       for.cond8.preheader.us.us.preheader1:
+; CHECK-NEXT:    [[INDVARS_IV37_PH:%.*]] = phi i64 [ [[N_VEC47]], [[MIDDLE_BLOCK42]] ], [ 0, [[FOR_COND8_PREHEADER_US_US_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_COND8_PREHEADER_US_US:%.*]]
+; CHECK:       for.cond8.preheader.us.us:
+; CHECK-NEXT:    [[INDVARS_IV37:%.*]] = phi i64 [ [[INDVARS_IV_NEXT38:%.*]], [[FOR_COND8_PREHEADER_US_US]] ], [ [[INDVARS_IV37_PH]], [[FOR_COND8_PREHEADER_US_US_PREHEADER1]] ]
+; CHECK-NEXT:    [[ARRAYIDX20_US_US:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDVARS_IV37]]
+; CHECK-NEXT:    [[ARRAYIDX20_PROMOTED_US_US:%.*]] = load i32, ptr [[ARRAYIDX20_US_US]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[ARRAYIDX20_PROMOTED_US_US]], [[TMP16]]
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[ARRAYIDX20_US_US]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT38]] = add nuw nsw i64 [[INDVARS_IV37]], 1
+; CHECK-NEXT:    [[EXITCOND41_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT38]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND41_NOT]], label [[FOR_COND_CLEANUP6]], label [[FOR_COND8_PREHEADER_US_US]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER2]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND8_PREHEADER_US_US_PREHEADER]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK:       for.cond.cleanup6:
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[DOTOFFLOAD_BASEPTRS]], align 8
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[DOTOFFLOAD_PTRS]], align 8
+; CHECK-NEXT:    store i64 [[MUL]], ptr [[DOTOFFLOAD_SIZES]], align 8
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_issue(ptr nonnull @[[GLOB1]], i64 -1, i32 1, ptr nonnull [[DOTOFFLOAD_BASEPTRS]], ptr nonnull [[DOTOFFLOAD_PTRS]], ptr nonnull [[DOTOFFLOAD_SIZES]], ptr nonnull @.offload_maptypes, ptr null, ptr null, ptr nonnull [[HANDLE]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr @global, align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INC31:%.*]] = add nsw i32 [[TMP22]], 1
+; CHECK-NEXT:    [[CMP32:%.*]] = icmp sgt i32 [[TMP22]], 0
+; CHECK-NEXT:    br i1 [[CMP32]], label [[IF_THEN:%.*]], label [[IF_END44:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[INC34:%.*]] = add nuw nsw i32 [[TMP22]], 2
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ugt i32 [[TMP22]], 4
+; CHECK-NEXT:    br i1 [[CMP35]], label [[IF_THEN37:%.*]], label [[IF_END44]]
+; CHECK:       if.then37:
+; CHECK-NEXT:    [[CMP39:%.*]] = icmp ult i32 [[TMP22]], 8
+; CHECK-NEXT:    [[SPEC_SELECT_V:%.*]] = select i1 [[CMP39]], i32 4, i32 3
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = add nuw nsw i32 [[SPEC_SELECT_V]], [[TMP22]]
+; CHECK-NEXT:    br label [[IF_END44]]
+; CHECK:       if.end44:
+; CHECK-NEXT:    [[TMP23:%.*]] = phi i32 [ [[INC34]], [[IF_THEN]] ], [ [[INC31]], [[FOR_COND_CLEANUP6]] ], [ [[SPEC_SELECT]], [[IF_THEN37]] ]
+; CHECK-NEXT:    [[INC45:%.*]] = add nsw i32 [[TMP23]], 1
+; CHECK-NEXT:    store i32 [[INC45]], ptr @global, align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(ptr nonnull @[[GLOB1]], i64 -1, ptr nonnull [[HANDLE]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__tgt_target_data_end_mapper(ptr nonnull @[[GLOB1]], i64 -1, i32 1, ptr nonnull [[DOTOFFLOAD_BASEPTRS]], ptr nonnull [[DOTOFFLOAD_PTRS]], ptr nonnull [[DOTOFFLOAD_SIZES]], ptr nonnull @.offload_maptypes, ptr null, ptr null)
+; CHECK-NEXT:    ret void
 ;
-
-
-
-
-
-
 entry:
-  %.offload_baseptrs = alloca [1 x i8*], align 8
-  %.offload_ptrs = alloca [1 x i8*], align 8
+  %.offload_baseptrs = alloca [1 x ptr], align 8
+  %.offload_ptrs = alloca [1 x ptr], align 8
   %.offload_sizes = alloca [1 x i64], align 8
-
-  ; FIXME: call to @__tgt_target_data_begin_issue_mapper(...) should be moved here.
-  %call = tail call i32 (...) @rand()
-
-  %conv = zext i32 %size to i64
-  %0 = shl nuw nsw i64 %conv, 3
-  %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
-  %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
-  store double* %a, double** %2, align 8
-  %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
-  %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
-  store double* %a, double** %4, align 8
-  %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
-  store i64 %0, i64* %5, align 8
-  call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null)
-
-  %rem = urem i32 %call, %size
-
-  call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null)
-  ret i32 %rem
+  %conv = sext i32 %N to i64
+  %mul = shl nsw i64 %conv, 2
+  %call = tail call noalias ptr @malloc(i64 noundef %mul) #3
+  %cmp18 = icmp sgt i32 %N, 0
+  br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup6
+
+for.body.preheader: ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  %min.iters.check = icmp ult i32 %N, 4
+  br i1 %min.iters.check, label %for.body.preheader54, label %vector.ph
+
+vector.ph: ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967292
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = getelementptr inbounds i32, ptr %call, i64 %index
+  store <4 x i32> %vec.ind, ptr %0, align 4, !tbaa !8
+  %index.next = add nuw i64 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+  %1 = icmp eq i64 %index.next, %n.vec
+  br i1 %1, label %middle.block, label %vector.body, !llvm.loop !12
+
+middle.block: ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br i1 %cmp.n, label %for.cond3.preheader, label %for.body.preheader54
+
+for.body.preheader54: ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  br label %for.body
+
+for.cond3.preheader: ; preds = %for.body, %middle.block
+  br i1 %cmp18, label %for.cond8.preheader.us.us.preheader, label %for.cond.cleanup6
+
+for.cond8.preheader.us.us.preheader: ; preds = %for.cond3.preheader
+  %2 = add nsw i32 %N, -1
+  %3 = zext i32 %2 to i33
+  %4 = add nsw i32 %N, -2
+  %5 = zext i32 %4 to i33
+  %6 = mul i33 %3, %5
+  %7 = lshr i33 %6, 1
+  %8 = trunc i33 %7 to i32
+  %9 = mul i32 %N, 1001
+  %10 = add i32 %9, %8
+  %11 = add i32 %10, -1
+  %12 = mul i32 %11, %2
+  %13 = add i32 %8, 1000
+  %14 = mul i32 %13, %N
+  %15 = add i32 %12, %14
+  %16 = add i32 %15, %8
+  %17 = add i32 %16, -1
+  %wide.trip.count40 = zext i32 %N to i64
+  %min.iters.check44 = icmp ult i32 %N, 4
+  br i1 %min.iters.check44, label %for.cond8.preheader.us.us.preheader53, label %vector.ph45
+
+vector.ph45: ; preds = %for.cond8.preheader.us.us.preheader
+  %n.vec47 = and i64 %wide.trip.count, 4294967292
+  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %17, i64 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  br label %vector.body50
+
+vector.body50: ; preds = %vector.body50, %vector.ph45
+  %index51 = phi i64 [ 0, %vector.ph45 ], [ %index.next52, %vector.body50 ]
+  %18 = getelementptr inbounds i32, ptr %call, i64 %index51
+  %wide.load = load <4 x i32>, ptr %18, align 4, !tbaa !8
+  %19 = add <4 x i32> %broadcast.splat, %wide.load
+  store <4 x i32> %19, ptr %18, align 4, !tbaa !8
+  %index.next52 = add nuw i64 %index51, 4
+  %20 = icmp eq i64 %index.next52, %n.vec47
+  br i1 %20, label %middle.block42, label %vector.body50, !llvm.loop !16
+
+middle.block42: ; preds = %vector.body50
+  %cmp.n49 = icmp eq i64 %n.vec47, %wide.trip.count
+  br i1 %cmp.n49, label %for.cond.cleanup6, label %for.cond8.preheader.us.us.preheader53
+
+for.cond8.preheader.us.us.preheader53: ; preds = %for.cond8.preheader.us.us.preheader, %middle.block42
+  %indvars.iv37.ph = phi i64 [ 0, %for.cond8.preheader.us.us.preheader ], [ %n.vec47, %middle.block42 ]
+  br label %for.cond8.preheader.us.us
+
+for.cond8.preheader.us.us: ; preds = %for.cond8.preheader.us.us.preheader53, %for.cond8.preheader.us.us
+  %indvars.iv37 = phi i64 [ %indvars.iv.next38, %for.cond8.preheader.us.us ], [ %indvars.iv37.ph, %for.cond8.preheader.us.us.preheader53 ]
+  %arrayidx20.us.us = getelementptr inbounds i32, ptr %call, i64 %indvars.iv37
+  %arrayidx20.promoted.us.us = load i32, ptr %arrayidx20.us.us, align 4, !tbaa !8
+  %21 = add i32 %17, %arrayidx20.promoted.us.us
+  store i32 %21, ptr %arrayidx20.us.us, align 4, !tbaa !8
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %exitcond41.not = icmp eq i64 %indvars.iv.next38, %wide.trip.count40
+  br i1 %exitcond41.not, label %for.cond.cleanup6, label %for.cond8.preheader.us.us, !llvm.loop !17
+
+for.body: ; preds = %for.body.preheader54, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader54 ]
+  %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv
+  %22 = trunc i64 %indvars.iv to i32
+  store i32 %22, ptr %arrayidx, align 4, !tbaa !8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond3.preheader, label %for.body, !llvm.loop !18
+
+for.cond.cleanup6: ; preds = %for.cond8.preheader.us.us, %middle.block42, %entry, %for.cond3.preheader
+  %23 = load i32, ptr @global, align 4, !tbaa !8
+  %inc31 = add nsw i32 %23, 1
+  %cmp32 = icmp sgt i32 %23, 0
+  br i1 %cmp32, label %if.then, label %if.end44
+
+if.then: ; preds = %for.cond.cleanup6
+  %inc34 = add nuw nsw i32 %23, 2
+  %cmp35 = icmp ugt i32 %23, 4
+  br i1 %cmp35, label %if.then37, label %if.end44
+
+if.then37: ; preds = %if.then
+  %cmp39 = icmp ult i32 %23, 8
+  %spec.select.v = select i1 %cmp39, i32 4, i32 3
+  %spec.select = add nuw nsw i32 %23, %spec.select.v
+  br label %if.end44
+
+if.end44: ; preds = %if.then37, %if.then, %for.cond.cleanup6
+  %24 = phi i32 [ %inc34, %if.then ], [ %inc31, %for.cond.cleanup6 ], [ %spec.select, %if.then37 ]
+  %inc45 = add nsw i32 %24, 1
+  store i32 %inc45, ptr @global, align 4, !tbaa !8
+  store ptr %call, ptr %.offload_baseptrs, align 8
+  store ptr %call, ptr %.offload_ptrs, align 8
+  store i64 %mul, ptr %.offload_sizes, align 8
+  call void @__tgt_target_data_begin_mapper(ptr nonnull @1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes, ptr null, ptr null)
+  call void @__tgt_target_data_end_mapper(ptr nonnull @1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes, ptr null, ptr null)
+  ret void
 }
-declare void @__tgt_target_data_begin_mapper(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**)
-declare i32 @__tgt_target_teams_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**, i32, i32)
-declare void @__tgt_target_data_end_mapper(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**)
-
-declare dso_local i32 @rand(...)
-; CHECK: declare void @__tgt_target_data_begin_mapper_issue(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**, %struct.__tgt_async_info*)
-; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info*)
-!llvm.module.flags = !{!0}
+; Function Attrs: inaccessiblememonly mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0)
+declare noalias noundef ptr @malloc(i64 noundef) local_unnamed_addr #1
+
+; Function Attrs: nounwind
+declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) local_unnamed_addr #2
+
+; Function Attrs: nounwind
+declare void @__tgt_target_data_end_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) local_unnamed_addr #2
+
+attributes #0 = { nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { inaccessiblememonly mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0) "alloc-family"="malloc" "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+attributes #3 = { nounwind allocsize(0) }
+
+!llvm.module.flags = !{!1, !2, !3, !4, !5}
+!llvm.ident = !{!6}
+!llvm.embedded.objects = !{!7}
+!nvvm.annotations = !{}
+
+!0 = !{}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 7, !"openmp", i32 50}
+!3 = !{i32 7, !"PIC Level", i32 2}
+!4 = !{i32 7, !"PIE Level", i32 2}
+!5 = !{i32 7, !"uwtable", i32 2}
+!6 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git cf1d9a1fdca258ab56f3060dfa4a303b8127350e)"}
+!7 = !{ptr @llvm.embedded.object, !".llvm.offloading"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"int", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
+!12 = distinct !{!12, !13, !14, !15}
+!13 = !{!"llvm.loop.mustprogress"}
+!14 = !{!"llvm.loop.unroll.disable"}
+!15 = !{!"llvm.loop.isvectorized", i32 1}
+!16 = distinct !{!16, !13, !14, !15}
+!17 = distinct !{!17, !13, !14, !15}
+!18 = distinct !{!18, !13, !14, !15}
-!0 = !{i32 7, !"openmp", i32 50}
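For reference, here is a minimal IR sketch (not part of the patch) of the issue/wait shape the updated CHECK lines above verify: the data-begin transfer is issued asynchronously, independent host work overlaps it, and the wait is placed right before the mapped data is needed. The function name @sketch, the %ident/%bp/%p/%sz/%maptypes parameters, and the single-pointer layout of %struct.__tgt_async_info are assumptions made only for this example.

; Sketch only: illustrates the issue/wait split checked above; names are illustrative.
%struct.__tgt_async_info = type { ptr } ; assumed single-pointer layout for the example

define void @sketch(ptr %ident, ptr %bp, ptr %p, ptr %sz, ptr %maptypes) {
entry:
  %handle = alloca %struct.__tgt_async_info, align 8
  ; Issue: same arguments as __tgt_target_data_begin_mapper, plus the async handle.
  call void @__tgt_target_data_begin_mapper_issue(ptr %ident, i64 -1, i32 1, ptr %bp, ptr %p, ptr %sz, ptr %maptypes, ptr null, ptr null, ptr %handle)
  ; ... independent host computation overlaps the transfer here ...
  ; Wait: synchronize on the handle before the device data is used.
  call void @__tgt_target_data_begin_mapper_wait(ptr %ident, i64 -1, ptr %handle)
  ret void
}

declare void @__tgt_target_data_begin_mapper_issue(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr)
declare void @__tgt_target_data_begin_mapper_wait(ptr, i64, ptr)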