diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -35,7 +35,7 @@
   /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
   OMPRTL_NVPTX__kmpc_kernel_deinit,
   /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
-  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
+  /// int16_t RequiresOMPRuntime);
   OMPRTL_NVPTX__kmpc_spmd_kernel_init,
   /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
   OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
@@ -1345,8 +1345,7 @@
   llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                          /*RequiresOMPRuntime=*/
-                         Bld.getInt16(RequiresFullRuntime ? 1 : 0),
-                         /*RequiresDataSharing=*/Bld.getInt16(0)};
+                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
   CGF.EmitRuntimeCall(
       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
@@ -1561,7 +1560,7 @@
   case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
     // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
-    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
-    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+    // int16_t RequiresOMPRuntime);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
     auto *FnTy =
         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -32,7 +32,7 @@
 // CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
 // CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
-// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0)
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0)
 #pragma omp target simd
   for (int i = 0; i < N; i++) {
     arr[i] = 1;
diff --git a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
--- a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
@@ -21,28 +21,28 @@
 // CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
 void foo() {
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target teams distribute parallel for simd if(a)
@@ -67,28 +67,28 @@
   for (int i = 0; i < 10; ++i)
     ;
   int a;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target teams distribute parallel for lastprivate(a)
@@ -112,28 +112,28 @@
 #pragma omp target teams distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target teams
@@ -175,28 +175,28 @@
 #pragma omp distribute parallel for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target teams
@@ -227,28 +227,28 @@
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target
@@ -286,22 +286,22 @@
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 #pragma omp target parallel for if(a)
   for (int i = 0; i < 10; ++i)
@@ -324,28 +324,28 @@
 #pragma omp target parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
 #pragma omp target parallel if(a)
@@ -376,27 +376,27 @@
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
 #pragma omp target
@@ -434,22 +434,22 @@
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 #pragma omp target
 #pragma omp parallel for
diff --git a/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
--- a/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
@@ -11,13 +11,13 @@
 // CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
 void foo() {
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target teams distribute parallel for simd
   for (int i = 0; i < 10; ++i)
     ;
@@ -40,13 +40,13 @@
   for (int i = 0; i < 10; ++i)
     ;
   int a;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target teams distribute parallel for lastprivate(a)
   for (int i = 0; i < 10; ++i)
     a = i;
@@ -68,13 +68,13 @@
 #pragma omp target teams distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target teams
 #pragma omp distribute parallel for simd
   for (int i = 0; i < 10; ++i)
@@ -103,13 +103,13 @@
 #pragma omp distribute parallel for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target teams
 #pragma omp distribute parallel for
   for (int i = 0; i < 10; ++i)
@@ -138,13 +138,13 @@
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target
 #pragma omp teams
 #pragma omp distribute parallel for
@@ -180,13 +180,13 @@
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target parallel for
   for (int i = 0; i < 10; ++i)
    ;
@@ -208,13 +208,13 @@
 #pragma omp target parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target parallel
 #pragma omp for simd
   for (int i = 0; i < 10; ++i)
@@ -243,13 +243,13 @@
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target
 #pragma omp parallel
 #pragma omp for simd ordered
@@ -285,13 +285,13 @@
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
    ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target
 #pragma omp parallel for
   for (int i = 0; i < 10; ++i)
diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
@@ -61,7 +61,7 @@
 // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
 // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
 // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //
@@ -99,7 +99,7 @@
 // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
 // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
 // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //
diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
@@ -53,7 +53,7 @@
 // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
 // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
 // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
 // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
@@ -84,7 +84,7 @@
 // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
 // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
 // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
 // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
@@ -54,7 +54,7 @@
 }
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //
@@ -72,7 +72,7 @@
 // CHECK: }
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //
@@ -90,7 +90,7 @@
 // CHECK: }
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -52,774 +52,756 @@
   return a;
 }
- // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
- //
- // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
- // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
- // CHECK: br label {{%?}}[[EXECUTE:.+]]
- //
- // CHECK: [[EXECUTE]]
- // CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
- // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
- //
- //
- // define internal void [[PFN]](
- // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
- // CHECK: [[EV:%.+]] = load double, double* [[E]], align
- // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
- // CHECK: store double [[ADD]], double* [[E]], align
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
- // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[CMP]], label
-
- // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
- // CHECK: [[EV:%.+]] = load double, double* [[E]], align
- // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
- // CHECK: store double [[ADD]], double* [[E_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label
- //
- // CHECK: ret
-
- //
- // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
- // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
- //
- // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
- // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
- //
- // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
- // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
- // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
- // CHECK: store double [[RES]], double* [[VAR_LHS]],
- // CHECK: ret void
-
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT:%.+]] = alloca double
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- //
- // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
- // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
- // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
- // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
- // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
- //
- // Inter warp copy function
- // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
- // CHECK: br label
- // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
- // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
- // CHECK: br i1 [[DONE_COPY]], label
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
- // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
- // CHECK: br label
- // CHECK: ret
-
-
-
-
-
-
-
-
-
- // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
- //
- // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
- // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
- // CHECK: br label {{%?}}[[EXECUTE:.+]]
- //
- // CHECK: [[EXECUTE]]
- // CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
- // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
- //
- //
- // define internal void [[PFN1]](
- // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
- // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
- // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[TRUNC]], i8* [[C]], align
- // CHECK: [[DV:%.+]] = load float, float* [[D]], align
- // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
- // CHECK: store float [[MUL]], float* [[D]], align
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: store i8* [[C]], i8** [[PTR1]], align
- // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
- // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[CMP]], label
- // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
- // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
- // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
- // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
- // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
- // CHECK: [[DV:%.+]] = load float, float* [[D]], align
- // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
- // CHECK: store float [[MUL]], float* [[D_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label
- //
- // CHECK: ret
-
- //
- // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
- //
- // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
- //
- // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
- // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
- //
- // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
- // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
- //
- // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
- // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
- // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
- // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
- // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
- //
- // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
- // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
- // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
- // CHECK: store float [[RES]], float* [[VAR2_LHS]],
- // CHECK: ret void
-
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- //
- // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
- //
- // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
- // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- //
- // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
- // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
- // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
- // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
- // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
- //
- // Inter warp copy function
- // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- //
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: ret
-
-
-
-
-
-
-
-
-
- // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
- //
- // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
- // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
- // CHECK: br label {{%?}}[[EXECUTE:.+]]
- //
- // CHECK: [[EXECUTE]]
- // CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
- // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
- //
- //
- // define internal void [[PFN2]](
- // CHECK: store i32 0, i32* [[A:%.+]], align
- // CHECK: store i16 -32768, i16* [[B:%.+]], align
- // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
- // CHECK: store i32 [[OR]], i32* [[A]], align
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
- // CHECK: store i16 [[TRUNC]], i16* [[B]], align
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
- // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
- // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
- // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[CMP]], label
-
- // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
- // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
- // CHECK: store i32 [[OR]], i32* [[A_IN]], align
- // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
- // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label
- //
- // CHECK: ret
-
- //
- // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
- // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
- //
- // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
- // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
- //
- // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
- // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
- //
- // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
- // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
- //
- // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
- // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
- // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
- // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
- //
- // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
- // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
- // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
- // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
- //
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
- // CHECK: ret void
-
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- //
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
- // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
- // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- //
- // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
- //
- // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
- // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
- // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
- // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
- //
- // Inter warp copy function
- // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* - // - // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align - // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ - // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* - // - // Read into warp 0. 
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: ret
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN]](
+// CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
+// CHECK: [[EV:%.+]] = load double, double* [[E]], align
+// CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
+// CHECK: store double [[ADD]], double* [[E]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
+// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+
+// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
+// CHECK: [[EV:%.+]] = load double, double* [[E]], align
+// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
+// CHECK: store double [[ADD]], double* [[E_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
+//
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
+//
+// CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
+// CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
+// CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
+// CHECK: store double [[RES]], double* [[VAR_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT:%.+]] = alloca double
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+//
+// CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
+// CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
+// CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
+// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
+// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+// CHECK: br label
+// CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+// CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+// CHECK: br i1 [[DONE_COPY]], label
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+// CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN1]](
+// CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
+// CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
+// CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[TRUNC]], i8* [[C]], align
+// CHECK: [[DV:%.+]] = load float, float* [[D]], align
+// CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
+// CHECK: store float [[MUL]], float* [[D]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: store i8* [[C]], i8** [[PTR1]], align
+// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
+// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
+// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
+// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
+// CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
+// CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
+// CHECK: [[DV:%.+]] = load float, float* [[D]], align
+// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
+// CHECK: store float [[MUL]], float* [[D_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+//
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+//
+// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
+//
+// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
+//
+// CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
+// CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
+// CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
+// CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+// CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
+//
+// CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
+// CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
+// CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+// CHECK: store float [[RES]], float* [[VAR2_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
+// CHECK: [[REMOTE_ELT2:%.+]] = alloca float
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
+//
+// CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+// CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
+//
+// CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
+// CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+//
+// CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
+// CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
+// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
+// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
+// CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
+//
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
+// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+//
+// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
+// CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: ret
+
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN2]](
+// CHECK: store i32 0, i32* [[A:%.+]], align
+// CHECK: store i16 -32768, i16* [[B:%.+]], align
+// CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
+// CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
+// CHECK: store i32 [[OR]], i32* [[A]], align
+// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+// CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
+// CHECK: store i16 [[TRUNC]], i16* [[B]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
+// CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
+// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
+// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+
+// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
+// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+// CHECK: store i32 [[OR]], i32* [[A_IN]], align
+// CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+// CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+// CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
+//
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
+//
+// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
+//
+// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
+//
+// CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
+// CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
+// CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+// CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
+//
+// CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
+// CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
+// CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
+// CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
+//
+// CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
+// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+//
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
+// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
+// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+//
+// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
+//
+// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
+// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
+// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
+//
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+//
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: ret
 #endif
diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -61,28 +61,28 @@
 }

 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l32}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK-NOT: call void @__kmpc_for_static_init
 // CHECK-NOT: call void @__kmpc_for_static_fini
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void

 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l37}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK-NOT: call void @__kmpc_for_static_init
 // CHECK-NOT: call void @__kmpc_for_static_fini
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void

 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l42}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK-NOT: call void @__kmpc_for_static_init
 // CHECK-NOT: call void @__kmpc_for_static_fini
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void

 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l47}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK-NOT: call void @__kmpc_for_static_init
 // CHECK-NOT: call void @__kmpc_for_static_fini
 // CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(
diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -232,7 +232,7 @@
 // CHECK: ret void

 // CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
 // CHECK-NOT: call void @__kmpc_serialized_parallel(
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -100,7 +100,7 @@
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l50(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void [[PARALLEL:@.+]](
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
@@ -128,7 +128,7 @@
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
@@ -143,7 +143,7 @@
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
@@ -159,7 +159,7 @@
 // Distribute with collapse(2)
 // CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -83,7 +83,7 @@
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)

 // SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
@@ -109,7 +109,7 @@
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)

 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -124,7 +124,7 @@
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)

 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -140,7 +140,7 @@
 // CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
 // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)

 // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
@@ -70,7 +70,7 @@
 }

 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l37(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)

 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -78,7 +78,7 @@
 // CHECK: ret void

 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)

 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -86,7 +86,7 @@
 // CHECK: ret void

 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l48(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)

 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -95,7 +95,7 @@
 // CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l53({{.+}}, i{{32|64}} [[F_IN:%.+]])
 // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)

 // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -92,15 +92,7 @@
   void *DataEnd;
   char Data[DS_Worker_Warp_Slot_Size];
 };
-// Additional master slot type which is initialized with the default master slot
-// size of 4 bytes.
-struct __kmpc_data_sharing_master_slot_static {
-  __kmpc_data_sharing_slot *Next;
-  __kmpc_data_sharing_slot *Prev;
-  void *PrevSlotStackPtr;
-  void *DataEnd;
-  char Data[DS_Slot_Size];
-};
+
 extern DEVICE SHARED DataSharingStateTy DataSharingState;

 ////////////////////////////////////////////////////////////////////////////////
@@ -204,37 +196,6 @@
   // init
   INLINE void InitTeamDescr();

-  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
-    // If this is invoked by the master thread of the master warp then
-    // initialize it with a smaller slot.
-    if (IsMasterThread) {
-      // Do not initialize this slot again if it has already been initalized.
-      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
-        return 0;
-      // Initialize the pointer to the end of the slot given the size of the
-      // data section. DataEnd is non-inclusive.
-      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
-      // We currently do not have a next slot.
-      master_rootS[0].Next = 0;
-      master_rootS[0].Prev = 0;
-      master_rootS[0].PrevSlotStackPtr = 0;
-      return (__kmpc_data_sharing_slot *)&master_rootS[0];
-    }
-    // Do not initialize this slot again if it has already been initalized.
-    if (worker_rootS[wid].DataEnd ==
-        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
-      return 0;
-    // Initialize the pointer to the end of the slot given the size of the data
-    // section. DataEnd is non-inclusive.
-    worker_rootS[wid].DataEnd =
-        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
-    // We currently do not have a next slot.
-    worker_rootS[wid].Next = 0;
-    worker_rootS[wid].Prev = 0;
-    worker_rootS[wid].PrevSlotStackPtr = 0;
-    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
-  }
-
   INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
     worker_rootS[wid].DataEnd =
         &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
@@ -253,7 +214,6 @@
   ALIGN(16)
   __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
-  ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
 };
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -77,8 +77,7 @@
   omptarget_nvptx_workFn = 0;
 }

-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
-                                    int16_t RequiresDataSharing) {
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
   PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");

   setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
@@ -134,15 +133,6 @@
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)ThreadLimit);
-
-  if (RequiresDataSharing && GetLaneId() == 0) {
-    // Warp master initializes data sharing environment.
-    unsigned WID = threadId / WARPSIZE;
-    __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(
-        WID, WID == WARPSIZE - 1);
-    DataSharingState.SlotPtr[WID] = RootS;
-    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
-  }
 }

 EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -421,8 +421,8 @@
 // non standard
 EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
 EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
-                                    int16_t RequiresDataSharing);
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
+                                    int16_t RequiresOMPRuntime);
 EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
 EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn);
 EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
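For reference, every updated CHECK line above pins down the same simplified contract: an SPMD kernel now passes only the thread limit and the RequiresOMPRuntime flag at init time, and no per-warp data sharing root slots are set up. A minimal CUDA-style sketch of that shape follows; the kernel name and body are illustrative only, not generated code, and the declarations mirror interface.h as patched above (EXTERN is the deviceRTL's extern "C" device linkage macro):

  #include <cstdint>

  extern "C" __device__ void __kmpc_spmd_kernel_init(int ThreadLimit,
                                                     int16_t RequiresOMPRuntime);
  extern "C" __device__ void __kmpc_data_sharing_init_stack_spmd();
  extern "C" __device__ void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);

  // Hypothetical stand-in for a compiler-emitted SPMD target region.
  __global__ void example_spmd_kernel() {
    // Full runtime requested (i16 1), as in the reduction kernels above. The
    // old trailing RequiresDataSharing argument is gone; the data sharing
    // stack is initialized by the separate call below instead.
    __kmpc_spmd_kernel_init(blockDim.x, /*RequiresOMPRuntime=*/1);
    __kmpc_data_sharing_init_stack_spmd();
    // ... outlined parallel region body ...
    __kmpc_spmd_kernel_deinit_v2(/*RequiresOMPRuntime=*/1);
  }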