Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -598,6 +598,9 @@
   /// caller is __kmpc_parallel_51.
   BooleanStateWithSetVector<uint8_t> ParallelLevels;
 
+  /// Flag that indicates if the kernel has nested parallelism.
+  bool NestedParallelism = false;
+
   /// Abstract State interface
   ///{
 
@@ -683,6 +686,7 @@
     SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
     ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
     ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
+    NestedParallelism |= KIS.NestedParallelism;
     return *this;
   }
 
@@ -3330,6 +3334,15 @@
     if (!KernelInitCB || !KernelDeinitCB)
       return ChangeStatus::UNCHANGED;
 
+    // Emit a global recording whether this kernel has nested parallelism.
+    Function *Kernel = getAnchorScope();
+    Module &M = *Kernel->getParent();
+    Type *Int1Ty = Type::getInt1Ty(M.getContext());
+    new GlobalVariable(M, Int1Ty, /*isConstant=*/true,
+                       GlobalValue::WeakAnyLinkage,
+                       ConstantInt::get(Int1Ty, NestedParallelism ? 1 : 0),
+                       Kernel->getName() + "_nested_parallelism");
+
     // If we can we change the execution mode to SPMD-mode otherwise we build a
     // custom state machine.
     ChangeStatus Changed = ChangeStatus::UNCHANGED;
@@ -4345,6 +4358,21 @@
       if (auto *ParallelRegion = dyn_cast<Function>(
               CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
         ReachedKnownParallelRegions.insert(ParallelRegion);
+        // Check for nested parallelism. The outlined region is argument 5 of
+        // __kmpc_parallel_51; if it is not a known function, or its kernel
+        // info is invalid, conservatively assume nested parallelism.
+        const unsigned int OutlinedFunctionArgNo = 5;
+        auto *OutlinedRegion = dyn_cast<Function>(
+            CB.getArgOperand(OutlinedFunctionArgNo)->stripPointerCasts());
+        const AAKernelInfo *FnAA =
+            OutlinedRegion
+                ? &A.getAAFor<AAKernelInfo>(
+                      *this, IRPosition::function(*OutlinedRegion),
+                      DepClassTy::OPTIONAL)
+                : nullptr;
+        NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
+                             !FnAA->ReachedKnownParallelRegions.empty() ||
+                             !FnAA->ReachedUnknownParallelRegions.empty();
         break;
       }
       // The condition above should usually get the parallel region function
Index: llvm/test/Transforms/OpenMP/nested_parallelism.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/OpenMP/nested_parallelism.ll
@@ -0,0 +1,475 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+; Check the per-kernel "<kernel>_nested_parallelism" global emitted by
+; openmp-opt: true for main_l23, whose parallel region launches another
+; parallel region, and false for main_l27, whose parallel region does not.
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, ptr }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+@__omp_offloading_10302_7fe18_main_l23_exec_mode = weak protected constant i8 3
+@__omp_offloading_10302_7fe18_main_l27_exec_mode = weak protected constant i8 1
+@i_shared = internal addrspace(3) global [4 x i8] undef, align 16
+@i.i_shared = internal addrspace(3) global [4 x i8] undef, align 16
+@j.i_shared = internal addrspace(3) global [4 x i8] undef, align 16
+@llvm.compiler.used = appending global [2 x ptr] [ptr @__omp_offloading_10302_7fe18_main_l23_exec_mode, ptr @__omp_offloading_10302_7fe18_main_l27_exec_mode], section "llvm.metadata"
+
+;.
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +; CHECK: @[[__OMP_OFFLOADING_10302_7FE18_MAIN_L23_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak protected constant i8 3 +; CHECK: @[[__OMP_OFFLOADING_10302_7FE18_MAIN_L27_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak protected constant i8 3 +; CHECK: @[[I_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 16 +; CHECK: @[[I_I_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 16 +; CHECK: @[[J_I_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 16 +; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @__omp_offloading_10302_7fe18_main_l23_exec_mode, ptr @__omp_offloading_10302_7fe18_main_l27_exec_mode], section "llvm.metadata" +; CHECK: @[[__OMP_OFFLOADING_10302_7FE18_MAIN_L23_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i1 true +; CHECK: @[[__OMP_OFFLOADING_10302_7FE18_MAIN_L27_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i1 false +;. 
+define weak_odr protected void @__omp_offloading_10302_7fe18_main_l23(i64 noundef %i) local_unnamed_addr #0 { +; CHECK-LABEL: @__omp_offloading_10302_7fe18_main_l23( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: user_code.entry: +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] +; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #[[ATTR6]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED_I:%.*]], label [[_Z3FOOI_INTERNALIZED_EXIT:%.*]] +; CHECK: region.guarded.i: +; CHECK-NEXT: [[I_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[I:%.*]] to i32 +; CHECK-NEXT: store i32 [[I_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), align 16, !tbaa [[TBAA11:![0-9]+]] +; CHECK-NEXT: br label [[_Z3FOOI_INTERNALIZED_EXIT]] +; CHECK: _Z3fooi.internalized.exit: +; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1]], i32 [[TMP2]]) #[[ATTR6]] +; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), ptr [[CAPTURED_VARS_ADDRS_I]], align 8, !tbaa [[TBAA15:![0-9]+]] +; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1) #[[ATTR6]] +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) 
+; CHECK-NEXT: call void @__kmpc_target_deinit(ptr nonnull @[[GLOB1]], i8 2) #[[ATTR6]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %captured_vars_addrs.i = alloca [1 x ptr], align 8 + %0 = tail call i32 @__kmpc_target_init(ptr nonnull @1, i8 2, i1 false) #6 + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %common.ret + +common.ret: ; preds = %entry, %_Z3fooi.internalized.exit + ret void + +user_code.entry: ; preds = %entry + call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %captured_vars_addrs.i) + %1 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6 + %2 = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #6 + %3 = icmp eq i32 %2, 0 + br i1 %3, label %region.guarded.i, label %_Z3fooi.internalized.exit + +region.guarded.i: ; preds = %user_code.entry + %i.addr.sroa.0.0.extract.trunc = trunc i64 %i to i32 + store i32 %i.addr.sroa.0.0.extract.trunc, ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), align 16, !tbaa !11 + br label %_Z3fooi.internalized.exit + +_Z3fooi.internalized.exit: ; preds = %user_code.entry, %region.guarded.i + tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @1, i32 %2) + store ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), ptr %captured_vars_addrs.i, align 8, !tbaa !15 + call void @__kmpc_parallel_51(ptr nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull %captured_vars_addrs.i, i64 1) #6 + call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %captured_vars_addrs.i) + call void @__kmpc_target_deinit(ptr nonnull @1, i8 2) #6 + br label %common.ret +} + +declare i32 @__kmpc_target_init(ptr, i8, i1) local_unnamed_addr + +define hidden void @_Z3fooi(i32 noundef %i1) local_unnamed_addr #1 { +; CHECK-LABEL: @_Z3fooi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull 
@[[GLOB1]]) #[[ATTR6]] +; CHECK-NEXT: [[I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) +; CHECK-NEXT: store i32 [[I1:%.*]], ptr [[I]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store ptr [[I]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 1) #[[ATTR6]] +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I]], i64 4) +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs = alloca [1 x ptr], align 8 + %0 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6 + %i = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) + store i32 %i1, ptr %i, align 16, !tbaa !11 + store ptr %i, ptr %captured_vars_addrs, align 8, !tbaa !15 + call void @__kmpc_parallel_51(ptr nonnull @1, i32 %0, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull %captured_vars_addrs, i64 1) #6 + call void @__kmpc_free_shared(ptr %i, i64 4) + ret void +} + +declare void @__kmpc_target_deinit(ptr, i8) local_unnamed_addr + +define weak_odr protected void @__omp_offloading_10302_7fe18_main_l27(i64 noundef %i) local_unnamed_addr #2 { +; CHECK-LABEL: @__omp_offloading_10302_7fe18_main_l27( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [2 x ptr], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR6]] +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: user_code.entry: +; CHECK-NEXT: [[I_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[I:%.*]] to i32 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, 
ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] +; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]] +; CHECK: region.check.tid: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] +; CHECK: region.guarded: +; CHECK-NEXT: store i32 100, ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store i32 [[I_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspacecast (ptr addrspace(3) @j.i_shared to ptr), align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] +; CHECK: region.guarded.end: +; CHECK-NEXT: br label [[REGION_BARRIER]] +; CHECK: region.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB1]], i32 [[TMP2]]) +; CHECK-NEXT: br label [[REGION_EXIT:%.*]] +; CHECK: region.exit: +; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @j.i_shared to ptr), ptr [[CAPTURED_VARS_ADDRS_I]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 1 +; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), ptr [[TMP4]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 2) #[[ATTR6]] +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +; CHECK-NEXT: call void @__kmpc_target_deinit(ptr nonnull @[[GLOB1]], i8 2) #[[ATTR6]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %captured_vars_addrs.i = alloca [2 x ptr], align 8 + %0 = tail call i32 @__kmpc_target_init(ptr nonnull @1, i8 1, i1 true) #6 + 
%exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %common.ret + +common.ret: ; preds = %entry, %user_code.entry + ret void + +user_code.entry: ; preds = %entry + %i.addr.sroa.0.0.extract.trunc = trunc i64 %i to i32 + call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %captured_vars_addrs.i) + %1 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6 + store i32 100, ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), align 16, !tbaa !11 + store i32 %i.addr.sroa.0.0.extract.trunc, ptr addrspacecast (ptr addrspace(3) @j.i_shared to ptr), align 16, !tbaa !11 + store ptr addrspacecast (ptr addrspace(3) @j.i_shared to ptr), ptr %captured_vars_addrs.i, align 8, !tbaa !15 + %2 = getelementptr inbounds [2 x ptr], ptr %captured_vars_addrs.i, i64 0, i64 1 + store ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), ptr %2, align 8, !tbaa !15 + call void @__kmpc_parallel_51(ptr nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs.i, i64 2) #6 + call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %captured_vars_addrs.i) + call void @__kmpc_target_deinit(ptr nonnull @1, i8 1) #6 + br label %common.ret +} + +define hidden void @_Z4foo2ii(i32 noundef %i1, i32 noundef %j2) local_unnamed_addr #1 { +; CHECK-LABEL: @_Z4foo2ii( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] +; CHECK-NEXT: [[J:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) +; CHECK-NEXT: [[I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) +; CHECK-NEXT: store i32 [[I1:%.*]], ptr [[I]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store i32 [[J2:%.*]], ptr [[J]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store ptr [[J]], ptr 
[[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +; CHECK-NEXT: store ptr [[I]], ptr [[TMP1]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 2) #[[ATTR6]] +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I]], i64 4) +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[J]], i64 4) +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs = alloca [2 x ptr], align 8 + %0 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6 + %j = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) + %i = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) + store i32 %i1, ptr %i, align 16, !tbaa !11 + store i32 %j2, ptr %j, align 16, !tbaa !11 + store ptr %j, ptr %captured_vars_addrs, align 8, !tbaa !15 + %1 = getelementptr inbounds [2 x ptr], ptr %captured_vars_addrs, i64 0, i64 1 + store ptr %i, ptr %1, align 8, !tbaa !15 + call void @__kmpc_parallel_51(ptr nonnull @1, i32 %0, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs, i64 2) #6 + call void @__kmpc_free_shared(ptr %i, i64 4) + call void @__kmpc_free_shared(ptr %j, i64 4) + ret void +} + +declare ptr @__kmpc_alloc_shared(i64) local_unnamed_addr #3 + +define internal void @__omp_outlined__(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid., ptr nocapture noundef nonnull align 4 dereferenceable(4) %i) #4 { +; CHECK-LABEL: @__omp_outlined__( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I_I:%.*]] = alloca [2 x ptr], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: [[INC:%.*]] = add nsw 
i32 [[TMP0]], 1 +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: [[ADD_I:%.*]] = add nsw i32 [[TMP0]], 11 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]]) +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] +; CHECK-NEXT: [[J_I_I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6]] +; CHECK-NEXT: [[I_I_I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I_I_I]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store i32 [[ADD_I]], ptr [[J_I_I]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store ptr [[J_I_I]], ptr [[CAPTURED_VARS_ADDRS_I_I]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS_I_I]], i64 0, i64 1 +; CHECK-NEXT: store ptr [[I_I_I]], ptr [[TMP2]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]], i64 2) #[[ATTR6]] +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I_I_I]], i64 4) #[[ATTR6]] +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[J_I_I]], i64 4) #[[ATTR6]] +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]]) +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs.i.i = alloca [2 x ptr], align 8 + %0 = load i32, ptr %i, align 4, !tbaa !11 + %inc = add nsw i32 %0, 1 + store i32 %inc, ptr %i, align 4, !tbaa !11 + %add.i = add nsw i32 %0, 11 + call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %captured_vars_addrs.i.i) + %1 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6 + %j.i.i = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #6 + %i.i.i = 
tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #6 + store i32 %inc, ptr %i.i.i, align 16, !tbaa !11 + store i32 %add.i, ptr %j.i.i, align 16, !tbaa !11 + store ptr %j.i.i, ptr %captured_vars_addrs.i.i, align 8, !tbaa !15 + %2 = getelementptr inbounds [2 x ptr], ptr %captured_vars_addrs.i.i, i64 0, i64 1 + store ptr %i.i.i, ptr %2, align 8, !tbaa !15 + call void @__kmpc_parallel_51(ptr nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs.i.i, i64 2) #6 + call void @__kmpc_free_shared(ptr %i.i.i, i64 4) #6 + call void @__kmpc_free_shared(ptr %j.i.i, i64 4) #6 + call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %captured_vars_addrs.i.i) + ret void +} + +define hidden void @_Z4foo1i(i32 noundef %i) local_unnamed_addr #1 { +; CHECK-LABEL: @_Z4foo1i( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [2 x ptr], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I:%.*]], 10 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] +; CHECK-NEXT: [[J_I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) +; CHECK-NEXT: [[I_I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) +; CHECK-NEXT: store i32 [[I]], ptr [[I_I]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[J_I]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store ptr [[J_I]], ptr [[CAPTURED_VARS_ADDRS_I]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 1 +; CHECK-NEXT: store ptr [[I_I]], ptr [[TMP1]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull 
@__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 2) #[[ATTR6]] +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I_I]], i64 4) +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[J_I]], i64 4) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs.i = alloca [2 x ptr], align 8 + %add = add nsw i32 %i, 10 + call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %captured_vars_addrs.i) + %0 = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6 + %j.i = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) + %i.i = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) + store i32 %i, ptr %i.i, align 16, !tbaa !11 + store i32 %add, ptr %j.i, align 16, !tbaa !11 + store ptr %j.i, ptr %captured_vars_addrs.i, align 8, !tbaa !15 + %1 = getelementptr inbounds [2 x ptr], ptr %captured_vars_addrs.i, i64 0, i64 1 + store ptr %i.i, ptr %1, align 8, !tbaa !15 + call void @__kmpc_parallel_51(ptr nonnull @1, i32 %0, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs.i, i64 2) #6 + call void @__kmpc_free_shared(ptr %i.i, i64 4) + call void @__kmpc_free_shared(ptr %j.i, i64 4) + call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %captured_vars_addrs.i) + ret void +} + +define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #5 { +; CHECK-LABEL: @__omp_outlined___wrapper( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I_I_I:%.*]] = alloca [2 x ptr], align 8 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]]) #[[ATTR6]] +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: [[TMP4:%.*]] 
= load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP4]], 1 +; CHECK-NEXT: store i32 [[INC_I]], ptr [[TMP3]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: [[ADD_I_I:%.*]] = add nsw i32 [[TMP4]], 11 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I_I]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] +; CHECK-NEXT: [[J_I_I_I:%.*]] = call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6]] +; CHECK-NEXT: [[I_I_I_I:%.*]] = call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6]] +; CHECK-NEXT: store i32 [[INC_I]], ptr [[I_I_I_I]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store i32 [[ADD_I_I]], ptr [[J_I_I_I]], align 16, !tbaa [[TBAA11]] +; CHECK-NEXT: store ptr [[J_I_I_I]], ptr [[CAPTURED_VARS_ADDRS_I_I_I]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS_I_I_I]], i64 0, i64 1 +; CHECK-NEXT: store ptr [[I_I_I_I]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I_I]], i64 2) #[[ATTR6]] +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I_I_I_I]], i64 4) #[[ATTR6]] +; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[J_I_I_I]], i64 4) #[[ATTR6]] +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I_I]]) +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs.i.i.i = alloca [2 x ptr], align 8 + %global_args = alloca ptr, align 8 + call void @__kmpc_get_shared_variables(ptr nonnull %global_args) #6 + %2 = load ptr, ptr %global_args, align 8 + %3 = load ptr, ptr %2, align 8, !tbaa !15 + %4 = load i32, ptr %3, align 4, !tbaa !11 + %inc.i = add nsw i32 %4, 1 + store i32 
%inc.i, ptr %3, align 4, !tbaa !11 + %add.i.i = add nsw i32 %4, 11 + call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %captured_vars_addrs.i.i.i) + %5 = call i32 @__kmpc_global_thread_num(ptr nonnull @1) #6 + %j.i.i.i = call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #6 + %i.i.i.i = call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #6 + store i32 %inc.i, ptr %i.i.i.i, align 16, !tbaa !11 + store i32 %add.i.i, ptr %j.i.i.i, align 16, !tbaa !11 + store ptr %j.i.i.i, ptr %captured_vars_addrs.i.i.i, align 8, !tbaa !15 + %6 = getelementptr inbounds [2 x ptr], ptr %captured_vars_addrs.i.i.i, i64 0, i64 1 + store ptr %i.i.i.i, ptr %6, align 8, !tbaa !15 + call void @__kmpc_parallel_51(ptr nonnull @1, i32 %5, i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull %captured_vars_addrs.i.i.i, i64 2) #6 + call void @__kmpc_free_shared(ptr %i.i.i.i, i64 4) #6 + call void @__kmpc_free_shared(ptr %j.i.i.i, i64 4) #6 + call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %captured_vars_addrs.i.i.i) + ret void +} + +declare void @__kmpc_get_shared_variables(ptr) local_unnamed_addr + +declare i32 @__kmpc_global_thread_num(ptr) local_unnamed_addr #6 + +declare void @__kmpc_parallel_51(ptr, i32, i32, i32, i32, ptr, ptr, ptr, i64) local_unnamed_addr #7 + +declare void @__kmpc_free_shared(ptr allocptr nocapture, i64) local_unnamed_addr #8 + +define internal void @__omp_outlined__1(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid., ptr nocapture noundef nonnull align 4 dereferenceable(4) %j, ptr nocapture noundef nonnull readonly align 4 dereferenceable(4) %i) #9 { +; CHECK-LABEL: @__omp_outlined__1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[J:%.*]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I:%.*]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; 
CHECK-NEXT: store i32 [[ADD]], ptr [[J]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: ret void +; +entry: + %0 = load i32, ptr %j, align 4, !tbaa !11 + %1 = load i32, ptr %i, align 4, !tbaa !11 + %add = add nsw i32 %1, %0 + store i32 %add, ptr %j, align 4, !tbaa !11 + ret void +} + +define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #5 { +; CHECK-LABEL: @__omp_outlined__1_wrapper( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]]) #[[ATTR6]] +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa [[TBAA15]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: [[ADD_I:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: store i32 [[ADD_I]], ptr [[TMP3]], align 4, !tbaa [[TBAA11]] +; CHECK-NEXT: ret void +; +entry: + %global_args = alloca ptr, align 8 + call void @__kmpc_get_shared_variables(ptr nonnull %global_args) #6 + %2 = load ptr, ptr %global_args, align 8 + %3 = load ptr, ptr %2, align 8, !tbaa !15 + %4 = getelementptr inbounds ptr, ptr %2, i64 1 + %5 = load ptr, ptr %4, align 8, !tbaa !15 + %6 = load i32, ptr %3, align 4, !tbaa !11 + %7 = load i32, ptr %5, align 4, !tbaa !11 + %add.i = add nsw i32 %7, %6 + store i32 %add.i, ptr %3, align 4, !tbaa !11 + ret void +} + +declare i32 @__kmpc_get_hardware_thread_id_in_block() local_unnamed_addr + +declare void @__kmpc_barrier_simple_spmd(ptr, i32) local_unnamed_addr #10 + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #11 + +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #11 + +attributes #0 = { 
alwaysinline convergent norecurse nounwind "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +attributes #1 = { mustprogress nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +attributes #2 = { alwaysinline norecurse nounwind "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +attributes #3 = { nofree nosync nounwind allocsize(0) } +attributes #4 = { alwaysinline norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +attributes #5 = { norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +attributes #6 = { nounwind } +attributes #7 = { alwaysinline } +attributes #8 = { nosync nounwind } +attributes #9 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +attributes #10 = { convergent nounwind } +attributes #11 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!omp_offload.info = !{!0, !1} +!nvvm.annotations = !{!2, !3} +!llvm.module.flags = !{!4, !5, !6, !7, !8, !9} +!llvm.ident = !{!10} + +!0 = !{i32 0, i32 66306, i32 523800, !"main", i32 23, i32 0, i32 0} +!1 = !{i32 0, i32 66306, i32 523800, !"main", i32 27, i32 0, i32 1} +!2 = !{ptr @__omp_offloading_10302_7fe18_main_l23, !"kernel", i32 1} +!3 = !{ptr @__omp_offloading_10302_7fe18_main_l27, !"kernel", i32 1} +!4 = !{i32 7, !"Dwarf Version", i32 2} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{i32 7, 
!"openmp", i32 50} +!7 = !{i32 7, !"openmp-device", i32 50} +!8 = !{i32 8, !"PIC Level", i32 2} +!9 = !{i32 7, !"frame-pointer", i32 2} +!10 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git 817f64e7ce545a2d0ec4484b4066cb73ddb31fdd)"} +!11 = !{!12, !12, i64 0} +!12 = !{!"int", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C++ TBAA"} +!15 = !{!16, !16, i64 0} +!16 = !{!"any pointer", !13, i64 0} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { alwaysinline convergent norecurse nounwind "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { alwaysinline norecurse nounwind "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nofree nosync nounwind allocsize(0) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { alwaysinline norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +; CHECK: attributes #[[ATTR6]] = { nounwind } +; CHECK: attributes #[[ATTR7:[0-9]+]] = { alwaysinline } +; CHECK: attributes #[[ATTR8:[0-9]+]] = { nosync nounwind } +; CHECK: attributes #[[ATTR9:[0-9]+]] = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="sm_80" "target-features"="+ptx42,+sm_80" } +; CHECK: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind } +; CHECK: attributes #[[ATTR11:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 66306, i32 523800, !"main", i32 23, i32 0, i32 0} +; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 66306, i32 523800, !"main", i32 27, i32 0, i32 1} +; CHECK: [[META2:![0-9]+]] = !{ptr @__omp_offloading_10302_7fe18_main_l23, !"kernel", i32 1} +; CHECK: [[META3:![0-9]+]] = !{ptr @__omp_offloading_10302_7fe18_main_l27, !"kernel", i32 1} +; CHECK: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META7:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META8:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META9:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META10:![0-9]+]] = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git 817f64e7ce545a2d0ec4484b4066cb73ddb31fdd)"} +; CHECK: [[TBAA11]] = !{!12, !12, i64 0} +; CHECK: [[META12:![0-9]+]] = !{!"int", !13, i64 0} +; CHECK: [[META13:![0-9]+]] = !{!"omnipotent char", !14, i64 0} +; CHECK: [[META14:![0-9]+]] = !{!"Simple C++ TBAA"} +; CHECK: [[TBAA15]] = !{!16, !16, i64 0} +; CHECK: [[META16:![0-9]+]] = !{!"any pointer", !13, i64 0} +;.