Diff 275563

llvm/test/Transforms/OpenMP/mem_transfer_hiding.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
				; RUN: opt -S -passes=openmpopt < %s | FileCheck %s
				target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
				; Review thread (Phabricator inline comments attached to the RUN line):
				;   jdoerfert: Why do you want to run this with stats? I guess later you just use FileCheck and verify the IR, right?
				;   hamax97 (author): I thought it would be useful to perhaps count the number of split runtime calls, and how far they were moved forwards and backwards. What do you think? I have no problem not doing it.
				;   jdoerfert: We will eventually check the resulting IR, which contains all the information. We basically compare against the expected splits and movements, so any deviation needs to be flagged. That said, having a test case to verify that our statistics tracking works and one to verify that the remarks (= feedback to the user) work is important. Both should be separate test cases with appropriate names, though.
				;   jdoerfert: No error output and asserts needed. We should, however, already use FileCheck to check for the output we care about. You can use `llvm/utils/update_test_checks.py` on this file. Once we optimize this we can nicely see the effect in the diff.

				; FIXME: This struct should be generated after splitting at least one of the runtime calls.
				; %struct.__tgt_async_info = type { i8* }
				%struct.ident_t = type { i32, i32, i32, i32, i8* }
				; NOTE(review): the first two fields are pointers (i8*) in the libomptarget
				; entry layout; the type is not referenced by any function in this file.
				%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }

				; Map types passed to __tgt_target_data_begin/end in @heavyComputation1.
				@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35]
				; Region id, sizes and map types passed to __tgt_target_teams in
				; @heavyComputation1 (@.offload_sizes.1 is also used by its data_begin/end).
				@.__omp_offloading_heavyComputation1.region_id = weak constant i8 0
				@.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8]
				@.offload_maptypes.2 = private unnamed_addr constant [1 x i64] [i64 800]

				; Region id for @heavyComputation2; @.offload_maptypes.3 is passed to
				; __tgt_target_data_begin/end in both @heavyComputation2 and @heavyComputation3.
				@.__omp_offloading_heavyComputation2.region_id = weak constant i8 0
				@.offload_maptypes.3 = private unnamed_addr constant [2 x i64] [i64 35, i64 35]

				; Region id for @heavyComputation3; @.offload_sizes.2 and @.offload_maptypes.4
				; are passed to __tgt_target_teams in both @heavyComputation2 and
				; @heavyComputation3.
				@.__omp_offloading_heavyComputation3.region_id = weak constant i8 0
				@.offload_sizes.2 = private unnamed_addr constant [2 x i64] [i64 4, i64 0]
				@.offload_maptypes.4 = private unnamed_addr constant [2 x i64] [i64 800, i64 544]

				; Map types passed to __tgt_target_data_begin/end in @dataTransferOnly1.
				@.offload_maptypes.5 = private unnamed_addr constant [1 x i64] [i64 33]

				;double heavyComputation1() {
				;  double a = rand() % 777;
				;  double random = rand();
				;
				;  //#pragma omp target data map(a)
				;  void* args[1];
				;  args[0] = &a;
				;  __tgt_target_data_begin(..., args, ...)
				;
				;  #pragma omp target teams
				;  for (int i = 0; i < 1000; ++i) {
				;    a *= i*i / 2;
				;  }
				;
				;  return random + a;
				;}
				;
				; Test scenario: the mapped variable `a` is a local alloca. The function calls
				; rand() twice, then __tgt_target_data_begin, __tgt_target_teams (with a host
				; fallback on failure) and __tgt_target_data_end. The expected output below is
				; currently identical to the input; the FIXMEs describe the intended split.
				define dso_local double @heavyComputation1() {
				; CHECK-LABEL: define {{[^@]+}}@heavyComputation1()
				; CHECK-NEXT: entry:
				; CHECK-NEXT: %a = alloca double, align 8
				; CHECK-NEXT: %.offload_baseptrs = alloca [1 x i8*], align 8
				; CHECK-NEXT: %.offload_ptrs = alloca [1 x i8*], align 8
				; CHECK-NEXT: %.offload_baseptrs4 = alloca [1 x i8*], align 8
				; CHECK-NEXT: %.offload_ptrs5 = alloca [1 x i8*], align 8
				; CHECK-NEXT: %0 = bitcast double* %a to i8*
				; CHECK-NEXT: %call = call i32 @rand()
				; CHECK-NEXT: %rem = srem i32 %call, 777
				; CHECK-NEXT: %conv = sitofp i32 %rem to double
				; CHECK-NEXT: store double %conv, double* %a, align 8
				; CHECK-NEXT: %call1 = call i32 @rand()
				; CHECK-NEXT: %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
				; CHECK-NEXT: %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
				; CHECK-NEXT: store double* %a, double** %2, align 8
				; CHECK-NEXT: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
				; CHECK-NEXT: %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
				; CHECK-NEXT: store double* %a, double** %4, align 8
				; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
				; CHECK-NEXT: %5 = bitcast double* %a to i64*
				; CHECK-NEXT: %6 = load i64, i64* %5, align 8
				; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
				; CHECK-NEXT: %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64*
				; CHECK-NEXT: store i64 %6, i64* %8, align 8
				; CHECK-NEXT: %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0
				; CHECK-NEXT: %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
				; CHECK-NEXT: store i64 %6, i64* %10, align 8
				; CHECK-NEXT: %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
				; CHECK-NEXT: %12 = icmp eq i32 %11, 0
				; CHECK-NEXT: br i1 %12, label %omp_offload.cont, label %omp_offload.failed
				; CHECK: omp_offload.failed:
				; CHECK-NEXT: call void @heavyComputation1FallBack(i64 %6)
				; CHECK-NEXT: br label %omp_offload.cont
				; CHECK: omp_offload.cont:
				; CHECK-NEXT: %conv2 = sitofp i32 %call1 to double
				; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
				; CHECK-NEXT: %13 = load double, double* %a, align 8
				; CHECK-NEXT: %add = fadd double %13, %conv2
				; CHECK-NEXT: ret double %add
				;
				entry:
				  %a = alloca double, align 8
				  %.offload_baseptrs = alloca [1 x i8*], align 8
				  %.offload_ptrs = alloca [1 x i8*], align 8
				  %.offload_baseptrs4 = alloca [1 x i8*], align 8
				  %.offload_ptrs5 = alloca [1 x i8*], align 8

				  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
				  ; %device_id1 = alloca i64, align 8
				  ; %async_info1 = alloca %struct.__tgt_async_info, align 8

				  %0 = bitcast double* %a to i8*
				  %call = call i32 @rand()
				  %rem = srem i32 %call, 777
				  %conv = sitofp i32 %rem to double
				  store double %conv, double* %a, align 8

				  ; FIXME: The "issue" should be moved here.
				  %call1 = call i32 @rand()

				  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
				  ; split into its "issue" and "wait" counterparts and moved upwards
				  ; and downwards, respectively. The call should be replaced to something
				  ; like ...
				  ; Issue - this is moved upwards.
				  ; ... setup code ...
				  ; store i64 -1, i64* %device_id1, align 8
				  ; %handle1 = call i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
				  ; Wait - this is moved downwards.
				  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
				  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
				  ; store i8* %handle1, i8** %queue1, align 8
				  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
				  ; Fill the base-pointer and pointer arrays with the address of %a.
				  %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
				  %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
				  store double* %a, double** %2, align 8
				  %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
				  %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
				  store double* %a, double** %4, align 8
				  call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))

				  ; The current bits of %a are loaded as an i64 and stored into the second pair
				  ; of argument arrays, which are handed to __tgt_target_teams.
				  %5 = bitcast double* %a to i64*
				  %6 = load i64, i64* %5, align 8
				  %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
				  %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64*
				  store i64 %6, i64* %8, align 8
				  %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0
				  %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
				  store i64 %6, i64* %10, align 8

				  ; FIXME: The "wait" should be moved here.
				  %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)

				  ; A non-zero result takes the host fallback path.
				  %12 = icmp eq i32 %11, 0
				  br i1 %12, label %omp_offload.cont, label %omp_offload.failed

				omp_offload.failed: ; preds = %entry
				  call void @heavyComputation1FallBack(i64 %6)
				  br label %omp_offload.cont

				omp_offload.cont: ; preds = %entry, %omp_offload.failed
				  %conv2 = sitofp i32 %call1 to double
				  call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
				  %13 = load double, double* %a, align 8
				  %add = fadd double %13, %conv2
				  ; Review thread (Phabricator inline comments attached at this line):
				  ;   jdoerfert: For the issue to be hoisted above the `rand` call the pointer
				  ;     needs to be noalias or we need better information about `rand`. As it
				  ;     is, `rand` might modify `a` which prevents moving.
				  ;   hamax97 (author): Oh yes, you're right. So, this test case is only useful
				  ;     if `a` is `noalias`, or I could add some code between the
				  ;     `target data map()` and the `target teams` which would enable moving
				  ;     the `wait` function. The latter would avoid the use of `noalias`, which
				  ;     is not commonly used I guess. But maybe it's a good idea to have a test
				  ;     case where `noalias` is used; it would be a nice optimization.
				  ;   jdoerfert: The test case is useful to verify that we do not accidentally
				  ;     hoist above something we are not allowed to (= negative test). You want
				  ;     tests with interesting code on both sides to verify movement in both
				  ;     directions, yes. We will need more tests for coverage later. Noalias is
				  ;     not uncommon either: people use `__restrict`/`restrict`, and the
				  ;     Attributor can derive `noalias` even without. Generally speaking, we
				  ;     want all kinds of test cases to cover all kinds of situations. Usual
				  ;     approach: copy this function and add `noalias` to one of them ;)
				  ret double %add
				}

				; Host-side stand-in that @heavyComputation1 calls from its
				; omp_offload.failed block; it ignores its argument and returns immediately.
				define internal void @heavyComputation1FallBack(i64 %a) {
				  ret void
				}

				;int heavyComputation2(double* a, unsigned size) {
				;  int random = rand() % 7;
				;
				;  //#pragma omp target data map(a[0:size], size)
				;  void* args[2];
				;  args[0] = &a;
				;  args[1] = &size;
				;  __tgt_target_data_begin(..., args, ...)
				;
				;  #pragma omp target teams
				;  for (int i = 0; i < size; ++i) {
				;    a[i] = ++a[i] * 3.141624;
				;  }
				;
				;  return random;
				;}
				;
				; Test scenario: `a` is a plain pointer argument (no noalias), so the FIXMEs
				; below expect only the "wait" half of the split call to move. The expected
				; output is currently identical to the input.
				define dso_local i32 @heavyComputation2(double* %a, i32 %size) {
				; CHECK-LABEL: define {{[^@]+}}@heavyComputation2(double* %a, i32 %size)
				; CHECK-NEXT: entry:
				; CHECK-NEXT: %size.addr = alloca i32, align 4
				; CHECK-NEXT: %.offload_baseptrs = alloca [2 x i8*], align 8
				; CHECK-NEXT: %.offload_ptrs = alloca [2 x i8*], align 8
				; CHECK-NEXT: %.offload_sizes = alloca [2 x i64], align 8
				; CHECK-NEXT: %.offload_baseptrs2 = alloca [2 x i8*], align 8
				; CHECK-NEXT: %.offload_ptrs3 = alloca [2 x i8*], align 8
				; CHECK-NEXT: store i32 %size, i32* %size.addr, align 4
				; CHECK-NEXT: %call = call i32 @rand()
				; CHECK-NEXT: %conv = zext i32 %size to i64
				; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
				; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
				; CHECK-NEXT: %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
				; CHECK-NEXT: store double* %a, double** %2, align 8
				; CHECK-NEXT: %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
				; CHECK-NEXT: %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
				; CHECK-NEXT: store double* %a, double** %4, align 8
				; CHECK-NEXT: %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
				; CHECK-NEXT: store i64 %0, i64* %5, align 8
				; CHECK-NEXT: %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
				; CHECK-NEXT: %7 = bitcast i8** %6 to i32**
				; CHECK-NEXT: store i32* %size.addr, i32** %7, align 8
				; CHECK-NEXT: %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
				; CHECK-NEXT: %9 = bitcast i8** %8 to i32**
				; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
				; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
				; CHECK-NEXT: store i64 4, i64* %10, align 8
				; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
				; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
				; CHECK-NEXT: %size.casted = zext i32 %11 to i64
				; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
				; CHECK-NEXT: %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
				; CHECK-NEXT: store i64 %size.casted, i64* %13, align 8
				; CHECK-NEXT: %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
				; CHECK-NEXT: %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
				; CHECK-NEXT: store i64 %size.casted, i64* %15, align 8
				; CHECK-NEXT: %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
				; CHECK-NEXT: %17 = bitcast i8** %16 to double**
				; CHECK-NEXT: store double* %a, double** %17, align 8
				; CHECK-NEXT: %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
				; CHECK-NEXT: %19 = bitcast i8** %18 to double**
				; CHECK-NEXT: store double* %a, double** %19, align 8
				; CHECK-NEXT: %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
				; CHECK-NEXT: %21 = icmp eq i32 %20, 0
				; CHECK-NEXT: br i1 %21, label %omp_offload.cont, label %omp_offload.failed
				; CHECK: omp_offload.failed:
				; CHECK-NEXT: call void @heavyComputation2FallBack(i64 %size.casted, double* %a)
				; CHECK-NEXT: br label %omp_offload.cont
				; CHECK: omp_offload.cont:
				; CHECK-NEXT: %rem = srem i32 %call, 7
				; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
				; CHECK-NEXT: ret i32 %rem
				;
				entry:
				  %size.addr = alloca i32, align 4
				  %.offload_baseptrs = alloca [2 x i8*], align 8
				  %.offload_ptrs = alloca [2 x i8*], align 8
				  %.offload_sizes = alloca [2 x i64], align 8
				  %.offload_baseptrs2 = alloca [2 x i8*], align 8
				  %.offload_ptrs3 = alloca [2 x i8*], align 8

				  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
				  ; %device_id1 = alloca i64, align 8
				  ; %async_info1 = alloca %struct.__tgt_async_info, align 8

				  store i32 %size, i32* %size.addr, align 4
				  %call = call i32 @rand()

				  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
				  ; split into its "issue" and "wait" counterparts. Here though, the "issue"
				  ; cannot be moved upwards because it's not guaranteed that rand()
				  ; won't modify *a. Nevertheless, the "wait" can be moved downwards.
				  ; The call should be replaced to something like ...
				  ; Issue - this can't be moved upwards, *a might have aliases.
				  ; ... setup code ...
				  ; store i64 -1, i64* %device_id1, align 8
				  ; %handle1 = call i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
				  ; Wait - this is moved downwards.
				  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
				  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
				  ; store i8* %handle1, i8** %queue1, align 8
				  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
				  ; %0 = size * 8, stored as the size of entry 0.
				  %conv = zext i32 %size to i64
				  %0 = shl nuw nsw i64 %conv, 3
				  ; Entry 0 of the argument arrays: %a.
				  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
				  %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
				  store double* %a, double** %2, align 8
				  %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
				  %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
				  store double* %a, double** %4, align 8
				  %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
				  store i64 %0, i64* %5, align 8
				  ; Entry 1 of the argument arrays: %size.addr, with size 4.
				  %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
				  %7 = bitcast i8** %6 to i32**
				  store i32* %size.addr, i32** %7, align 8
				  %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
				  %9 = bitcast i8** %8 to i32**
				  store i32* %size.addr, i32** %9, align 8
				  %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
				  store i64 4, i64* %10, align 8
				  call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))

				  ; Second pair of argument arrays, handed to __tgt_target_teams: entry 0 holds
				  ; the zero-extended size value, entry 1 holds %a.
				  %11 = load i32, i32* %size.addr, align 4
				  %size.casted = zext i32 %11 to i64
				  %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
				  %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
				  store i64 %size.casted, i64* %13, align 8
				  %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
				  %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
				  store i64 %size.casted, i64* %15, align 8
				  %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
				  %17 = bitcast i8** %16 to double**
				  store double* %a, double** %17, align 8
				  %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
				  %19 = bitcast i8** %18 to double**
				  store double* %a, double** %19, align 8

				  ; FIXME: The "wait" should be moved here.
				  %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)

				  ; A non-zero result takes the host fallback path.
				  %21 = icmp eq i32 %20, 0
				  br i1 %21, label %omp_offload.cont, label %omp_offload.failed

				omp_offload.failed: ; preds = %entry
				  call void @heavyComputation2FallBack(i64 %size.casted, double* %a)
				  br label %omp_offload.cont

				omp_offload.cont: ; preds = %entry, %omp_offload.failed
				  %rem = srem i32 %call, 7
				  call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
				  ret i32 %rem
				}

				; Host-side stand-in that @heavyComputation2 calls from its
				; omp_offload.failed block; it ignores both arguments and returns immediately.
				define internal void @heavyComputation2FallBack(i64 %size, double* %a) {
				  ret void
				}

				;int heavyComputation3(double* restrict a, unsigned size) {
				;  int random = rand() % 7;
				;
				;  //#pragma omp target data map(a[0:size], size)
				;  void* args[2];
				;  args[0] = &a;
				;  args[1] = &size;
				;  __tgt_target_data_begin(..., args, ...)
				;
				;  #pragma omp target teams
				;  for (int i = 0; i < size; ++i) {
				;    a[i] = ++a[i] * 3.141624;
				;  }
				;
				;  return random;
				;}
				;
				; Test scenario: same instruction sequence as @heavyComputation2, but `a` is
				; marked noalias, so the FIXMEs below expect both halves of the split call to
				; move. The expected output is currently identical to the input.
				define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) {
				; CHECK-LABEL: define {{[^@]+}}@heavyComputation3(double* noalias %a, i32 %size)
				; CHECK-NEXT: entry:
				; CHECK-NEXT: %size.addr = alloca i32, align 4
				; CHECK-NEXT: %.offload_baseptrs = alloca [2 x i8*], align 8
				; CHECK-NEXT: %.offload_ptrs = alloca [2 x i8*], align 8
				; CHECK-NEXT: %.offload_sizes = alloca [2 x i64], align 8
				; CHECK-NEXT: %.offload_baseptrs2 = alloca [2 x i8*], align 8
				; CHECK-NEXT: %.offload_ptrs3 = alloca [2 x i8*], align 8
				; CHECK-NEXT: store i32 %size, i32* %size.addr, align 4
				; CHECK-NEXT: %call = call i32 @rand()
				; CHECK-NEXT: %conv = zext i32 %size to i64
				; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
				; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
				; CHECK-NEXT: %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
				; CHECK-NEXT: store double* %a, double** %2, align 8
				; CHECK-NEXT: %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
				; CHECK-NEXT: %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
				; CHECK-NEXT: store double* %a, double** %4, align 8
				; CHECK-NEXT: %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
				; CHECK-NEXT: store i64 %0, i64* %5, align 8
				; CHECK-NEXT: %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
				; CHECK-NEXT: %7 = bitcast i8** %6 to i32**
				; CHECK-NEXT: store i32* %size.addr, i32** %7, align 8
				; CHECK-NEXT: %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
				; CHECK-NEXT: %9 = bitcast i8** %8 to i32**
				; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
				; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
				; CHECK-NEXT: store i64 4, i64* %10, align 8
				; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
				; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
				; CHECK-NEXT: %size.casted = zext i32 %11 to i64
				; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
				; CHECK-NEXT: %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
				; CHECK-NEXT: store i64 %size.casted, i64* %13, align 8
				; CHECK-NEXT: %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
				; CHECK-NEXT: %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
				; CHECK-NEXT: store i64 %size.casted, i64* %15, align 8
				; CHECK-NEXT: %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
				; CHECK-NEXT: %17 = bitcast i8** %16 to double**
				; CHECK-NEXT: store double* %a, double** %17, align 8
				; CHECK-NEXT: %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
				; CHECK-NEXT: %19 = bitcast i8** %18 to double**
				; CHECK-NEXT: store double* %a, double** %19, align 8
				; CHECK-NEXT: %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
				; CHECK-NEXT: %21 = icmp eq i32 %20, 0
				; CHECK-NEXT: br i1 %21, label %omp_offload.cont, label %omp_offload.failed
				; CHECK: omp_offload.failed:
				; CHECK-NEXT: call void @heavyComputation3FallBack(i64 %size.casted, double* %a)
				; CHECK-NEXT: br label %omp_offload.cont
				; CHECK: omp_offload.cont:
				; CHECK-NEXT: %rem = srem i32 %call, 7
				; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
				; CHECK-NEXT: ret i32 %rem
				;
				entry:
				  %size.addr = alloca i32, align 4
				  %.offload_baseptrs = alloca [2 x i8*], align 8
				  %.offload_ptrs = alloca [2 x i8*], align 8
				  %.offload_sizes = alloca [2 x i64], align 8
				  %.offload_baseptrs2 = alloca [2 x i8*], align 8
				  %.offload_ptrs3 = alloca [2 x i8*], align 8

				  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
				  ; %device_id1 = alloca i64, align 8
				  ; %async_info1 = alloca %struct.__tgt_async_info, align 8

				  store i32 %size, i32* %size.addr, align 4

				  ; FIXME: The "issue" should be moved here.
				  %call = call i32 @rand()

				  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
				  ; split into its "issue" and "wait" counterparts and moved upwards
				  ; and downwards, respectively. The call should be replaced to something
				  ; like ...
				  ; Issue - this is moved upwards.
				  ; ... setup code ...
				  ; store i64 -1, i64* %device_id1, align 8
				  ; %handle1 = call i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
				  ; Wait - this is moved downwards.
				  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
				  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
				  ; store i8* %handle1, i8** %queue1, align 8
				  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
				  ; %0 = size * 8, stored as the size of entry 0.
				  %conv = zext i32 %size to i64
				  %0 = shl nuw nsw i64 %conv, 3
				  ; Entry 0 of the argument arrays: %a.
				  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
				  %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
				  store double* %a, double** %2, align 8
				  %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
				  %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
				  store double* %a, double** %4, align 8
				  %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
				  store i64 %0, i64* %5, align 8
				  ; Entry 1 of the argument arrays: %size.addr, with size 4.
				  %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
				  %7 = bitcast i8** %6 to i32**
				  store i32* %size.addr, i32** %7, align 8
				  %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
				  %9 = bitcast i8** %8 to i32**
				  store i32* %size.addr, i32** %9, align 8
				  %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
				  store i64 4, i64* %10, align 8
				  call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))

				  ; Second pair of argument arrays, handed to __tgt_target_teams: entry 0 holds
				  ; the zero-extended size value, entry 1 holds %a.
				  %11 = load i32, i32* %size.addr, align 4
				  %size.casted = zext i32 %11 to i64
				  %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
				  %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
				  store i64 %size.casted, i64* %13, align 8
				  %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
				  %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
				  store i64 %size.casted, i64* %15, align 8
				  %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
				  %17 = bitcast i8** %16 to double**
				  store double* %a, double** %17, align 8
				  %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
				  %19 = bitcast i8** %18 to double**
				  store double* %a, double** %19, align 8

				  ; FIXME: The "wait" should be moved here.
				  %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)

				  ; A non-zero result takes the host fallback path.
				  %21 = icmp eq i32 %20, 0
				  br i1 %21, label %omp_offload.cont, label %omp_offload.failed

				omp_offload.failed: ; preds = %entry
				  call void @heavyComputation3FallBack(i64 %size.casted, double* %a)
				  br label %omp_offload.cont

				omp_offload.cont: ; preds = %entry, %omp_offload.failed
				  %rem = srem i32 %call, 7
				  call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
				  ret i32 %rem
				}

				; Host-side stand-in that @heavyComputation3 calls from its
				; omp_offload.failed block; it ignores both arguments and returns immediately.
				define internal void @heavyComputation3FallBack(i64 %size, double* %a) {
				  ret void
				}

				;int dataTransferOnly1(double* restrict a, unsigned size) {
				;  // Random computation.
				;  int random = rand();
				;
				;  //#pragma omp target data map(to:a[0:size])
				;  void* args[1];
				;  args[0] = &a;
				;  __tgt_target_data_begin(..., args, ...)
				;
				;  // Random computation.
				;  random %= size;
				;  return random;
				;}
				;
				; Test scenario: there is no target region; only a urem sits between
				; __tgt_target_data_begin and __tgt_target_data_end, and `a` is noalias.
				; The expected output is currently identical to the input.
				define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) {
				; CHECK-LABEL: define {{[^@]+}}@dataTransferOnly1(double* noalias %a, i32 %size)
				; CHECK-NEXT: entry:
				; CHECK-NEXT: %.offload_baseptrs = alloca [1 x i8*], align 8
				; CHECK-NEXT: %.offload_ptrs = alloca [1 x i8*], align 8
				; CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
				; CHECK-NEXT: %call = call i32 @rand()
				; CHECK-NEXT: %conv = zext i32 %size to i64
				; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
				; CHECK-NEXT: %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
				; CHECK-NEXT: %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
				; CHECK-NEXT: store double* %a, double** %2, align 8
				; CHECK-NEXT: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
				; CHECK-NEXT: %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
				; CHECK-NEXT: store double* %a, double** %4, align 8
				; CHECK-NEXT: %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
				; CHECK-NEXT: store i64 %0, i64* %5, align 8
				; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
				; CHECK-NEXT: %rem = urem i32 %call, %size
				; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
				; CHECK-NEXT: ret i32 %rem
				;
				entry:
				  %.offload_baseptrs = alloca [1 x i8*], align 8
				  %.offload_ptrs = alloca [1 x i8*], align 8
				  %.offload_sizes = alloca [1 x i64], align 8

				  ; FIXME: Should have after splitting the runtime call __tgt_target_data_begin.
				  ; %device_id1 = alloca i64, align 8
				  ; %async_info1 = alloca %struct.__tgt_async_info, align 8

				  ; FIXME: The "issue" should be moved here.
				  %call = call i32 @rand()

				  ; FIXME: This setup for the runtime call __tgt_target_data_begin should be
				  ; split into its "issue" and "wait" counterparts and moved upwards
				  ; and downwards, respectively. The call should be replaced to something
				  ; like ...
				  ; Issue - this is moved upwards.
				  ; ... setup code ...
				  ; store i64 -1, i64* %device_id1, align 8
				  ; %handle1 = call i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
				  ; Wait - this is moved downwards.
				  ; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
				  ; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
				  ; store i8* %handle1, i8** %queue1, align 8
				  ; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
				  ; %0 = size * 8, stored as the size of the single mapped entry (%a).
				  %conv = zext i32 %size to i64
				  %0 = shl nuw nsw i64 %conv, 3
				  %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
				  %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
				  store double* %a, double** %2, align 8
				  %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
				  %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
				  store double* %a, double** %4, align 8
				  %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
				  store i64 %0, i64* %5, align 8
				  call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))

				  %rem = urem i32 %call, %size

				  ; FIXME: The "wait" should be moved here.
				  call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
				  ret i32 %rem
				}

				; Offloading runtime entry points called by the test functions in this file.
				; Parameter types match the call sites: (device id, argument count,
				; base-pointer array, pointer array, sizes, map types).
				declare dso_local void @__tgt_target_data_begin(i64, i32, i8**, i8**, i64*, i64*)

				; Same argument list with the region id inserted after the device id and two
				; trailing i32 values (always 0 at the call sites in this file).
				declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32)

				declare dso_local void @__tgt_target_data_end(i64, i32, i8**, i8**, i64*, i64*)

				declare dso_local i32 @rand()

				; FIXME: These two function declarations must be generated after splitting the runtime function
				; __tgt_target_data_begin.
				; declare dso_local i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8), i32, i8**, i8**, i64*, i64*)
				; declare dso_local void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info* dereferenceable(8))

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMPOpt] Test case 1 - Latency Hiding for Host to Device Memory Transfers
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 275563

llvm/test/Transforms/OpenMP/mem_transfer_hiding.ll

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMPOpt] Test case 1 - Latency Hiding for Host to Device Memory TransfersClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 275563

llvm/test/Transforms/OpenMP/mem_transfer_hiding.ll

[OpenMPOpt] Test case 1 - Latency Hiding for Host to Device Memory Transfers
ClosedPublic