Index: llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
===================================================================
--- llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
+++ llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
@@ -1,9 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
-; RUN: opt -S -passes=openmpopt < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature --scrub-attributes
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa < %s | FileCheck %s
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"

 ; FIXME: This struct should be generated after splitting at least one of the runtime calls.
 ; %struct.__tgt_async_info = type { i8* }
+
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
 %struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
@@ -46,18 +47,18 @@
 ; CHECK-NEXT: %.offload_baseptrs4 = alloca [1 x i8*], align 8
 ; CHECK-NEXT: %.offload_ptrs5 = alloca [1 x i8*], align 8
 ; CHECK-NEXT: %0 = bitcast double* %a to i8*
-; CHECK-NEXT: %call = call i32 @rand()
+; CHECK-NEXT: %call = tail call i32 (...) @rand()
 ; CHECK-NEXT: %rem = srem i32 %call, 777
 ; CHECK-NEXT: %conv = sitofp i32 %rem to double
 ; CHECK-NEXT: store double %conv, double* %a, align 8
-; CHECK-NEXT: %call1 = call i32 @rand()
+; CHECK-NEXT: %call1 = tail call i32 (...) @rand()
 ; CHECK-NEXT: %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
 ; CHECK-NEXT: %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
 ; CHECK-NEXT: store double* %a, double** %2, align 8
 ; CHECK-NEXT: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
 ; CHECK-NEXT: %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
 ; CHECK-NEXT: store double* %a, double** %4, align 8
-; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
 ; CHECK-NEXT: %5 = bitcast double* %a to i64*
 ; CHECK-NEXT: %6 = load i64, i64* %5, align 8
 ; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
@@ -66,17 +67,17 @@
 ; CHECK-NEXT: %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0
 ; CHECK-NEXT: %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
 ; CHECK-NEXT: store i64 %6, i64* %10, align 8
-; CHECK-NEXT: %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
-; CHECK-NEXT: %12 = icmp eq i32 %11, 0
-; CHECK-NEXT: br i1 %12, label %omp_offload.cont, label %omp_offload.failed
+; CHECK-NEXT: %11 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i32 0, i32 0)
+; CHECK-NEXT: %.not = icmp eq i32 %11, 0
+; CHECK-NEXT: br i1 %.not, label %omp_offload.cont, label %omp_offload.failed
 ; CHECK: omp_offload.failed:
 ; CHECK-NEXT: call void @heavyComputation1FallBack(i64 %6)
 ; CHECK-NEXT: br label %omp_offload.cont
 ; CHECK: omp_offload.cont:
 ; CHECK-NEXT: %conv2 = sitofp i32 %call1 to double
-; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
-; CHECK-NEXT: %13 = load double, double* %a, align 8
-; CHECK-NEXT: %add = fadd double %13, %conv2
+; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
+; CHECK-NEXT: %12 = load double, double* %a, align 8
+; CHECK-NEXT: %add = fadd double %12, %conv2
 ; CHECK-NEXT: ret double %add
 ;
 entry:
@@ -86,39 +87,27 @@
   %.offload_baseptrs4 = alloca [1 x i8*], align 8
   %.offload_ptrs5 = alloca [1 x i8*], align 8
-
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   %0 = bitcast double* %a to i8*
-  %call = call i32 @rand()
+  %call = tail call i32 (...) @rand()
   %rem = srem i32 %call, 777
   %conv = sitofp i32 %rem to double
   store double %conv, double* %a, align 8
-  ; FIXME: The "isue" should be moved here.
-  %call1 = call i32 @rand()
+  ; FIXME: call to @__tgt_target_data_begin_mapper_issue(...) should be moved here.
+  %call1 = tail call i32 (...) @rand()

-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ; split into its "issue" and "wait" counterpars and moved upwards
-  ; and downwards, respectively. The call should be replaced to something
-  ; like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
-  ; Wait - this is moved downwards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
   %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
   store double* %a, double** %2, align 8
   %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
   %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
   store double* %a, double** %4, align 8
-  call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
+  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
+  ; split into its "issue" and "wait" counterparts and moved upwards
+  ; and downwards, respectively.
+  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
+  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+  call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)

   %5 = bitcast double* %a to i64*
   %6 = load i64, i64* %5, align 8
@@ -129,21 +118,20 @@
   %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
   store i64 %6, i64* %10, align 8
-  ; FIXME: The "wait" should be moved here.
-  %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
-
-  %12 = icmp eq i32 %11, 0
-  br i1 %12, label %omp_offload.cont, label %omp_offload.failed
+  ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here.
+  %11 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i32 0, i32 0)
+  %.not = icmp eq i32 %11, 0
+  br i1 %.not, label %omp_offload.cont, label %omp_offload.failed

 omp_offload.failed:                               ; preds = %entry
   call void @heavyComputation1FallBack(i64 %6)
   br label %omp_offload.cont

-omp_offload.cont:                                 ; preds = %entry, %omp_offload.failed
+omp_offload.cont:                                 ; preds = %omp_offload.failed, %entry
   %conv2 = sitofp i32 %call1 to double
-  call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
-  %13 = load double, double* %a, align 8
-  %add = fadd double %13, %conv2
+  call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
+  %12 = load double, double* %a, align 8
+  %add = fadd double %12, %conv2
   ret double %add
 }
@@ -179,7 +167,7 @@
 ; CHECK-NEXT: %.offload_baseptrs2 = alloca [2 x i8*], align 8
 ; CHECK-NEXT: %.offload_ptrs3 = alloca [2 x i8*], align 8
 ; CHECK-NEXT: store i32 %size, i32* %size.addr, align 4
-; CHECK-NEXT: %call = call i32 @rand()
+; CHECK-NEXT: %call = tail call i32 (...) @rand()
 ; CHECK-NEXT: %conv = zext i32 %size to i64
 ; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -198,7 +186,7 @@
 ; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
 ; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
 ; CHECK-NEXT: store i64 4, i64* %10, align 8
-; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
 ; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
 ; CHECK-NEXT: %size.casted = zext i32 %11 to i64
 ; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -213,15 +201,15 @@
 ; CHECK-NEXT: %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
 ; CHECK-NEXT: %19 = bitcast i8** %18 to double**
 ; CHECK-NEXT: store double* %a, double** %19, align 8
-; CHECK-NEXT: %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
-; CHECK-NEXT: %21 = icmp eq i32 %20, 0
-; CHECK-NEXT: br i1 %21, label %omp_offload.cont, label %omp_offload.failed
+; CHECK-NEXT: %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i32 0, i32 0)
+; CHECK-NEXT: %.not = icmp eq i32 %20, 0
+; CHECK-NEXT: br i1 %.not, label %omp_offload.cont, label %omp_offload.failed
 ; CHECK: omp_offload.failed:
 ; CHECK-NEXT: call void @heavyComputation2FallBack(i64 %size.casted, double* %a)
 ; CHECK-NEXT: br label %omp_offload.cont
 ; CHECK: omp_offload.cont:
 ; CHECK-NEXT: %rem = srem i32 %call, 7
-; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
 ; CHECK-NEXT: ret i32 %rem
 ;
 entry:
@@ -232,27 +220,9 @@
   %.offload_baseptrs2 = alloca [2 x i8*], align 8
   %.offload_ptrs3 = alloca [2 x i8*], align 8
-
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   store i32 %size, i32* %size.addr, align 4
-  %call = call i32 @rand()
-
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ; split into its "issue" and "wait" counterpars. Here though, the "issue"
-  ; cannot be moved upwards because it's not guaranteed that rand()
-  ; won't modify *a. Nevertheless, the "wait" can be moved downwards.
-  ; The call should be replaced to something like ...
-  ; Issue - this can't be moved upwards, *a might have aliases.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
+  %call = tail call i32 (...) @rand()
+
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -271,7 +241,13 @@
   store i32* %size.addr, i32** %9, align 8
   %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
   store i64 4, i64* %10, align 8
-  call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
+  ; split into its "issue" and "wait" counterparts and moved upwards
+  ; and downwards, respectively. Here though, the "issue" cannot be moved upwards
+  ; because it's not guaranteed that rand() won't modify *a.
+  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
+  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+  call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)

   %11 = load i32, i32* %size.addr, align 4
   %size.casted = zext i32 %11 to i64
@@ -288,19 +264,18 @@
   %19 = bitcast i8** %18 to double**
   store double* %a, double** %19, align 8
-
-  ; FIXME: The "wait" should be moved here.
-  %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
-
-  %21 = icmp eq i32 %20, 0
-  br i1 %21, label %omp_offload.cont, label %omp_offload.failed
+  ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here.
+  %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i32 0, i32 0)
+  %.not = icmp eq i32 %20, 0
+  br i1 %.not, label %omp_offload.cont, label %omp_offload.failed

 omp_offload.failed:                               ; preds = %entry
   call void @heavyComputation2FallBack(i64 %size.casted, double* %a)
   br label %omp_offload.cont

-omp_offload.cont:                                 ; preds = %entry, %omp_offload.failed
+omp_offload.cont:                                 ; preds = %omp_offload.failed, %entry
   %rem = srem i32 %call, 7
-  call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+  call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
   ret i32 %rem
 }
@@ -336,7 +311,7 @@
 ; CHECK-NEXT: %.offload_baseptrs2 = alloca [2 x i8*], align 8
 ; CHECK-NEXT: %.offload_ptrs3 = alloca [2 x i8*], align 8
 ; CHECK-NEXT: store i32 %size, i32* %size.addr, align 4
-; CHECK-NEXT: %call = call i32 @rand()
+; CHECK-NEXT: %call = tail call i32 (...) @rand()
 ; CHECK-NEXT: %conv = zext i32 %size to i64
 ; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -355,7 +330,7 @@
 ; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
 ; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
 ; CHECK-NEXT: store i64 4, i64* %10, align 8
-; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
 ; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
 ; CHECK-NEXT: %size.casted = zext i32 %11 to i64
 ; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -370,15 +345,15 @@
 ; CHECK-NEXT: %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
 ; CHECK-NEXT: %19 = bitcast i8** %18 to double**
 ; CHECK-NEXT: store double* %a, double** %19, align 8
-; CHECK-NEXT: %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
-; CHECK-NEXT: %21 = icmp eq i32 %20, 0
-; CHECK-NEXT: br i1 %21, label %omp_offload.cont, label %omp_offload.failed
+; CHECK-NEXT: %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i32 0, i32 0)
+; CHECK-NEXT: %.not = icmp eq i32 %20, 0
+; CHECK-NEXT: br i1 %.not, label %omp_offload.cont, label %omp_offload.failed
 ; CHECK: omp_offload.failed:
 ; CHECK-NEXT: call void @heavyComputation3FallBack(i64 %size.casted, double* %a)
 ; CHECK-NEXT: br label %omp_offload.cont
 ; CHECK: omp_offload.cont:
 ; CHECK-NEXT: %rem = srem i32 %call, 7
-; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
 ; CHECK-NEXT: ret i32 %rem
 ;
 entry:
@@ -388,29 +363,11 @@
   %.offload_sizes = alloca [2 x i64], align 8
   %.offload_baseptrs2 = alloca [2 x i8*], align 8
   %.offload_ptrs3 = alloca [2 x i8*], align 8
-
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
   store i32 %size, i32* %size.addr, align 4
-  ; FIXME: The "issue" should be moved here.
-  %call = call i32 @rand()
+  ; FIXME: call to @__tgt_target_data_begin_mapper_issue(...) should be moved here.
+  %call = tail call i32 (...) @rand()

-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ; split into its "issue" and "wait" counterpars and moved upwards
-  ; and downwards, respectively. The call should be replaced to something
-  ; like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -429,7 +386,12 @@
   store i32* %size.addr, i32** %9, align 8
   %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
   store i64 4, i64* %10, align 8
-  call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
+  ; split into its "issue" and "wait" counterparts and moved upwards
+  ; and downwards, respectively.
+  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
+  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+  call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)

   %11 = load i32, i32* %size.addr, align 4
   %size.casted = zext i32 %11 to i64
@@ -446,19 +408,18 @@
   %19 = bitcast i8** %18 to double**
   store double* %a, double** %19, align 8
-
-  ; FIXME: The "wait" should be moved here.
-  %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
-
-  %21 = icmp eq i32 %20, 0
-  br i1 %21, label %omp_offload.cont, label %omp_offload.failed
+  ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here.
+  %20 = call i32 @__tgt_target_teams_mapper(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i32 0, i32 0)
+  %.not = icmp eq i32 %20, 0
+  br i1 %.not, label %omp_offload.cont, label %omp_offload.failed

 omp_offload.failed:                               ; preds = %entry
   call void @heavyComputation3FallBack(i64 %size.casted, double* %a)
   br label %omp_offload.cont

-omp_offload.cont:                                 ; preds = %entry, %omp_offload.failed
+omp_offload.cont:                                 ; preds = %omp_offload.failed, %entry
   %rem = srem i32 %call, 7
-  call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
+  call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
   ret i32 %rem
 }
@@ -487,7 +448,7 @@
 ; CHECK-NEXT: %.offload_baseptrs = alloca [1 x i8*], align 8
 ; CHECK-NEXT: %.offload_ptrs = alloca [1 x i8*], align 8
 ; CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
-; CHECK-NEXT: %call = call i32 @rand()
+; CHECK-NEXT: %call = tail call i32 (...) @rand()
 ; CHECK-NEXT: %conv = zext i32 %size to i64
 ; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
 ; CHECK-NEXT: %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -498,9 +459,9 @@
 ; CHECK-NEXT: store double* %a, double** %4, align 8
 ; CHECK-NEXT: %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
 ; CHECK-NEXT: store i64 %0, i64* %5, align 8
-; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
 ; CHECK-NEXT: %rem = urem i32 %call, %size
-; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
+; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
 ; CHECK-NEXT: ret i32 %rem
 ;
 entry:
@@ -508,26 +469,9 @@
   %.offload_ptrs = alloca [1 x i8*], align 8
   %.offload_sizes = alloca [1 x i64], align 8
-
-  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
-  ; %device_id1 = alloca i64, align 8
-  ; %async_info1 = alloca %struct.__tgt_async_info, align 8
-
-  ; FIXME: The "issue" should be moved here.
-  %call = call i32 @rand()
+  ; FIXME: call to @__tgt_target_data_begin_mapper_issue(...) should be moved here.
+  %call = tail call i32 (...) @rand()

-  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
-  ; split into its "issue" and "wait" counterpars and moved upwards
-  ; and downwards, respectively. The call should be replaced to something
-  ; like ...
-  ; Issue - this is moved upwards.
-  ; ... setup code ...
-  ; store i64 -1, i64* %device_id1, align 8
-  ; %handle1 = call i8* @__tgt_target_data_begin(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
-  ; Wait - this is moved downards.
-  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
-  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
-  ; store i8* %handle1, i8** %queue1, align 8
-  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
   %conv = zext i32 %size to i64
   %0 = shl nuw nsw i64 %conv, 3
   %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
@@ -538,24 +482,28 @@
   store double* %a, double** %4, align 8
   %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
   store i64 %0, i64* %5, align 8
-  call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
+  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
+  ; split into its "issue" and "wait" counterparts and moved upwards
+  ; and downwards, respectively. Here though, the "wait" cannot be moved downwards
+  ; because it is not worth it: there is no store or call for it to be hoisted
+  ; over.
+  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
+  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
+  call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)

   %rem = urem i32 %call, %size
-
-  ; FIXME: The "wait" should be moved here.
-  call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
+  call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
   ret i32 %rem
 }

-declare dso_local void @__tgt_target_data_begin(i64, i32, i8**, i8**, i64*, i64*)
-
-declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32)
-
-declare dso_local void @__tgt_target_data_end(i64, i32, i8**, i8**, i64*, i64*)
+declare void @__tgt_target_data_begin_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
+declare i32 @__tgt_target_teams_mapper(i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i32, i32)
+declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)

-declare dso_local i32 @rand()
+declare dso_local i32 @rand(...)

 ; FIXME: These two function declarations must be generated after splitting the runtime function
-; __tgt_target_data_begin.
-; declare dso_local i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8), i32, i8**, i8**, i64*, i64*)
-; declare dso_local void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info* dereferenceable(8))
+; __tgt_target_data_begin_mapper.
+; declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**)
+; declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info)
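
For reference, a minimal sketch of what one split transfer could look like once the pass
learns to generate it, using the _issue/_wait declarations outlined at the end of the file.
The argument list simply reuses the existing __tgt_target_data_begin_mapper operands from
@heavyComputation1; the handle value and the exact placement are assumptions, not what the
pass currently emits:

  ; Hypothetical post-split IR (not produced by openmpopt yet). The issue call
  ; starts the transfer and returns a handle; independent host work (e.g. the
  ; second rand() call) can then overlap the transfer before the wait.
  %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
  ; ... independent host code runs here while the transfer is in flight ...
  call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)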