diff --git a/clang/test/OpenMP/target_alloc_shared_emission.cpp b/clang/test/OpenMP/target_alloc_shared_emission.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/target_alloc_shared_emission.cpp @@ -0,0 +1,1805 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// REQUIRES: amdgpu-registered-target + + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host-amd.bc +// RUN: %clang_cc1 -verify -O2 -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host-amd.bc -o - | FileCheck %s --check-prefix=CHECK-AMD + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host-nvidia.bc +// RUN: %clang_cc1 -verify -O2 -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-unknown-unknown -emit-llvm %s -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host-nvidia.bc -o - | FileCheck %s --check-prefix=CHECK-NVIDIA + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +#define N 1000 + +void foo(int *stack); + +void emits_alloc_shared(const int *localPadding , int *res) +{ + int stack[64]; + int stackptr = 0; + stack[stackptr++] = -1; + *res = 0; + + do + { + if(localPadding[0] > 0) + stack[stackptr++] = 0; + *res = stack[--stackptr]; + foo(&stack[2]); + } while (*res > 0); +} + +int emits_alloc_shared_caller() { + const int maz = 1; + const int may = 2; + const int max = 3; + int res; + int localPadding[N]; +#pragma omp target teams distribute parallel for map(tofrom: localPadding[:N],maz, may, max) + + for (int pi = 0; pi < N; pi++) + { + for (int hz = 0; hz <= maz; hz++) + for (int hy = 0; hy <= may; hy++) + for (int hx = 0; hx <= max; hx++) { + emits_alloc_shared(localPadding, &res); + } + localPadding[pi] = res; + } + return localPadding[17]; +} + +void does_not_emit_alloc_shared(const int *localPadding , int *res) { + int stack[64]; + int stackptr = 0; + stack[stackptr++] = -1; + *res = 0; + + do + { + if(localPadding[0] > 0) + stack[stackptr++] = 0; + *res = stack[--stackptr]; + } while (*res > 0); +} + +int does_not_emit_alloc_shared_caller() { + const int maz = 1; + const int may = 2; + const int max = 3; + int res; + int localPadding[N]; +#pragma omp target teams distribute parallel for map(tofrom: localPadding[:N],maz, may, max) + + for (int pi = 0; pi < N; pi++) + { + for (int hz = 0; hz <= maz; hz++) + for (int hy = 0; hy <= may; hy++) + for (int hx = 0; hx <= max; hx++) { + // emits_alloc_shared(localPadding, &res); + does_not_emit_alloc_shared(localPadding, &res); + } + localPadding[pi] = res; + } + return localPadding[17]; +} + +#endif +// CHECK-AMD-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43 +// CHECK-AMD-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[LOCALPADDING:%.*]], i64 noundef [[RES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[DOTOMP_COMB_LB_I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_COMB_UB_I:%.*]] 
= alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [4 x ptr], align 8, addrspace(5) +// CHECK-AMD-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false) #[[ATTR2:[0-9]+]] +// CHECK-AMD-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +// CHECK-AMD-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] +// CHECK-AMD: common.ret: +// CHECK-AMD-NEXT: ret void +// CHECK-AMD: user_code.entry: +// CHECK-AMD-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]]) +// CHECK-AMD-NEXT: [[DOTOMP_COMB_LB_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB_I]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_COMB_UB_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB_I]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE_I]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST_I]] to ptr +// CHECK-AMD-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]] to ptr +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_COMB_LB_I]]) #[[ATTR12:[0-9]+]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA9:![0-9]+]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_COMB_UB_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 999, ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_STRIDE_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 1, ptr addrspace(5) [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_IS_LAST_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[DOTOMP_IS_LAST_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[NVPTX_NUM_THREADS_I:%.*]] = tail call i32 @__kmpc_get_hardware_num_threads_in_block() #[[ATTR2]] +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP1]], i32 91, ptr nonnull [[DOTOMP_IS_LAST_ASCAST_I]], ptr nonnull [[DOTOMP_COMB_LB_ASCAST_I]], ptr nonnull [[DOTOMP_COMB_UB_ASCAST_I]], ptr nonnull [[DOTOMP_STRIDE_ASCAST_I]], i32 1, i32 [[NVPTX_NUM_THREADS_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4 +// CHECK-AMD-NEXT: [[COND_I:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 999) +// CHECK-AMD-NEXT: store i32 [[COND_I]], ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[DOTOMP_IV_01_I:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_COMB_LB_I]], align 4 +// CHECK-AMD-NEXT: [[CMP12_I:%.*]] = icmp slt i32 [[DOTOMP_IV_01_I]], 1000 +// CHECK-AMD-NEXT: br i1 [[CMP12_I]], label [[OMP_INNER_FOR_BODY_LR_PH_I:%.*]], label %{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined.exit +// CHECK-AMD: omp.inner.for.body.lr.ph.i: +// 
CHECK-AMD-NEXT: [[RES_CASTED_SROA_0_0_INSERT_EXT:%.*]] = and i64 [[RES]], 4294967295 +// CHECK-AMD-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x ptr], ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]], i32 0, i32 1 +// CHECK-AMD-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]], i32 0, i32 2 +// CHECK-AMD-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]], i32 0, i32 3 +// CHECK-AMD-NEXT: [[TMP6:%.*]] = inttoptr i64 [[RES_CASTED_SROA_0_0_INSERT_EXT]] to ptr +// CHECK-AMD-NEXT: br label [[OMP_INNER_FOR_BODY_I:%.*]] +// CHECK-AMD: omp.inner.for.body.i: +// CHECK-AMD-NEXT: [[DOTOMP_IV_04_I:%.*]] = phi i32 [ [[DOTOMP_IV_01_I]], [[OMP_INNER_FOR_BODY_LR_PH_I]] ], [ [[ADD2_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ] +// CHECK-AMD-NEXT: [[STOREMERGE3_I:%.*]] = phi i32 [ [[COND_I]], [[OMP_INNER_FOR_BODY_LR_PH_I]] ], [ [[COND8_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ] +// CHECK-AMD-NEXT: [[TMP7:%.*]] = zext i32 [[DOTOMP_IV_04_I]] to i64 +// CHECK-AMD-NEXT: [[TMP8:%.*]] = zext i32 [[STOREMERGE3_I]] to i64 +// CHECK-AMD-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP7]] to ptr +// CHECK-AMD-NEXT: store ptr [[TMP9]], ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]], align 8, !tbaa [[TBAA13:![0-9]+]] +// CHECK-AMD-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP8]] to ptr +// CHECK-AMD-NEXT: store ptr [[TMP10]], ptr addrspace(5) [[TMP3]], align 8, !tbaa [[TBAA13]] +// CHECK-AMD-NEXT: store ptr [[LOCALPADDING]], ptr addrspace(5) [[TMP4]], align 8, !tbaa [[TBAA13]] +// CHECK-AMD-NEXT: store ptr [[TMP6]], ptr addrspace(5) [[TMP5]], align 8, !tbaa [[TBAA13]] +// CHECK-AMD-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined_omp_outlined, ptr null, ptr nonnull [[CAPTURED_VARS_ADDRS_ASCAST_I]], i64 4) #[[ATTR2]] +// CHECK-AMD-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ADD2_I]] = add nsw i32 [[TMP12]], [[TMP11]] +// CHECK-AMD-NEXT: store i32 [[ADD2_I]], ptr addrspace(5) [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP13]], [[TMP11]] +// CHECK-AMD-NEXT: [[COND8_I]] = call i32 @llvm.smin.i32(i32 [[ADD3_I]], i32 999) +// CHECK-AMD-NEXT: store i32 [[COND8_I]], ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP1_I:%.*]] = icmp slt i32 [[ADD2_I]], 1000 +// CHECK-AMD-NEXT: br i1 [[CMP1_I]], label [[OMP_INNER_FOR_BODY_I]], label %{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined.exit +// CHECK-AMD: {{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined.exit: +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP1]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_IS_LAST_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_STRIDE_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_COMB_UB_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: 
call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_COMB_LB_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]]) +// CHECK-AMD-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2) #[[ATTR2]] +// CHECK-AMD-NEXT: br label [[COMMON_RET]] +// +// +// CHECK-AMD-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined_omp_outlined +// CHECK-AMD-SAME: (ptr noalias nocapture noundef readonly [[DOTGLOBAL_TID_:%.*]], ptr noalias nocapture readnone [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr nocapture noundef nonnull align 4 dereferenceable(4000) [[LOCALPADDING:%.*]], i64 [[RES:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_LB]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_UB]]) #[[ATTR2]] +// CHECK-AMD-NEXT: [[CONV:%.*]] = trunc i64 [[DOTPREVIOUS_LB_]] to i32 +// CHECK-AMD-NEXT: [[CONV1:%.*]] = trunc i64 [[DOTPREVIOUS_UB_]] to i32 +// CHECK-AMD-NEXT: store i32 [[CONV]], ptr addrspace(5) [[DOTOMP_LB]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: store i32 [[CONV1]], ptr addrspace(5) [[DOTOMP_UB]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_STRIDE]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 1, ptr addrspace(5) [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_IS_LAST]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP0]], i32 33, ptr nonnull [[DOTOMP_IS_LAST_ASCAST]], ptr nonnull [[DOTOMP_LB_ASCAST]], ptr nonnull [[DOTOMP_UB_ASCAST]], ptr nonnull [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1) #[[ATTR2]] +// CHECK-AMD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_LB]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CONV229:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-AMD-NEXT: [[CMP_NOT30:%.*]] = icmp ugt i64 [[CONV229]], [[DOTPREVIOUS_UB_]] +// CHECK-AMD-NEXT: br i1 [[CMP_NOT30]], label [[OMP_LOOP_EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]] +// CHECK-AMD: for.cond.preheader: +// CHECK-AMD-NEXT: [[CONV232:%.*]] = phi i64 [ [[CONV2:%.*]], [[FOR_COND_CLEANUP:%.*]] ], [ [[CONV229]], [[ENTRY:%.*]] ] +// CHECK-AMD-NEXT: [[DOTOMP_IV_031:%.*]] = phi i32 [ [[ADD18:%.*]], [[FOR_COND_CLEANUP]] ], [ 
[[TMP1]], [[ENTRY]] ] +// CHECK-AMD-NEXT: br label [[FOR_COND4_PREHEADER:%.*]] +// CHECK-AMD: for.cond4.preheader: +// CHECK-AMD-NEXT: [[CMP3:%.*]] = phi i1 [ true, [[FOR_COND_PREHEADER]] ], [ false, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_2:%.*]] ] +// CHECK-AMD-NEXT: [[STACK_I:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I:%.*]] +// CHECK-AMD: for.cond.cleanup: +// CHECK-AMD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[LOCALPADDING]], i64 0, i64 [[CONV232]] +// CHECK-AMD-NEXT: store i32 [[TMP26:%.*]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ADD18]] = add nsw i32 [[TMP2]], [[DOTOMP_IV_031]] +// CHECK-AMD-NEXT: [[CONV2]] = sext i32 [[ADD18]] to i64 +// CHECK-AMD-NEXT: [[CMP_NOT:%.*]] = icmp ugt i64 [[CONV2]], [[DOTPREVIOUS_UB_]] +// CHECK-AMD-NEXT: br i1 [[CMP_NOT]], label [[OMP_LOOP_EXIT]], label [[FOR_COND_PREHEADER]] +// CHECK-AMD: do.body.i: +// CHECK-AMD-NEXT: [[STACKPTR_0_I:%.*]] = phi i32 [ 1, [[FOR_COND4_PREHEADER]] ], [ [[DEC_I:%.*]], [[IF_END_I:%.*]] ] +// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP3]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I]] +// CHECK-AMD: if.then.i: +// CHECK-AMD-NEXT: [[INC2_I:%.*]] = add nsw i32 [[STACKPTR_0_I]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I:%.*]] = sext i32 [[STACKPTR_0_I]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I]], i64 0, i64 [[IDXPROM3_I]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I]] +// CHECK-AMD: if.end.i: +// CHECK-AMD-NEXT: [[STACKPTR_1_I:%.*]] = phi i32 [ [[INC2_I]], [[IF_THEN_I]] ], [ [[STACKPTR_0_I]], [[DO_BODY_I]] ] +// CHECK-AMD-NEXT: [[DEC_I]] = add nsw i32 [[STACKPTR_1_I]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I:%.*]] = sext i32 [[DEC_I]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I]], i64 0, i64 [[IDXPROM5_I]] +// CHECK-AMD-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX6_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I]]) #[[ATTR13:[0-9]+]] +// CHECK-AMD-NEXT: [[CMP8_I:%.*]] = icmp sgt i32 [[TMP4]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I]], label [[DO_BODY_I]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_1:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_1]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1:%.*]] +// CHECK-AMD: do.body.i.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT]] ], [ [[DEC_I_1:%.*]], [[IF_END_I_1:%.*]] ] 
+// CHECK-AMD-NEXT: [[TMP5:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_1:%.*]] = icmp sgt i32 [[TMP5]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_1]], label [[IF_THEN_I_1:%.*]], label [[IF_END_I_1]] +// CHECK-AMD: if.then.i.1: +// CHECK-AMD-NEXT: [[INC2_I_1:%.*]] = add nsw i32 [[STACKPTR_0_I_1]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_1:%.*]] = sext i32 [[STACKPTR_0_I_1]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1]], i64 0, i64 [[IDXPROM3_I_1]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1]] +// CHECK-AMD: if.end.i.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1:%.*]] = phi i32 [ [[INC2_I_1]], [[IF_THEN_I_1]] ], [ [[STACKPTR_0_I_1]], [[DO_BODY_I_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_1]] = add nsw i32 [[STACKPTR_1_I_1]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_1:%.*]] = sext i32 [[DEC_I_1]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1]], i64 0, i64 [[IDXPROM5_I_1]] +// CHECK-AMD-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX6_I_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_1]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_1:%.*]] = icmp sgt i32 [[TMP6]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_1]], label [[DO_BODY_I_1]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.1: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_1]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_2:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_2]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2:%.*]] +// CHECK-AMD: do.body.i.2: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1]] ], [ [[DEC_I_2:%.*]], [[IF_END_I_2:%.*]] ] +// CHECK-AMD-NEXT: [[TMP7:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_2:%.*]] = icmp sgt i32 [[TMP7]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_2]], label [[IF_THEN_I_2:%.*]], label [[IF_END_I_2]] +// CHECK-AMD: if.then.i.2: +// CHECK-AMD-NEXT: [[INC2_I_2:%.*]] = add nsw i32 [[STACKPTR_0_I_2]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_2:%.*]] = sext i32 [[STACKPTR_0_I_2]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2]], i64 0, i64 [[IDXPROM3_I_2]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2]] +// CHECK-AMD: if.end.i.2: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2:%.*]] = phi i32 [ [[INC2_I_2]], [[IF_THEN_I_2]] ], [ [[STACKPTR_0_I_2]], [[DO_BODY_I_2]] ] +// CHECK-AMD-NEXT: [[DEC_I_2]] = add nsw i32 [[STACKPTR_1_I_2]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_2:%.*]] = sext i32 [[DEC_I_2]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2]], i64 0, i64 [[IDXPROM5_I_2]] +// CHECK-AMD-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6_I_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_2]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_2:%.*]] = icmp sgt i32 [[TMP8]], 0 +// 
CHECK-AMD-NEXT: br i1 [[CMP8_I_2]], label [[DO_BODY_I_2]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.2: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_2]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_3:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_3]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3:%.*]] +// CHECK-AMD: do.body.i.3: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2]] ], [ [[DEC_I_3:%.*]], [[IF_END_I_3:%.*]] ] +// CHECK-AMD-NEXT: [[TMP9:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_3:%.*]] = icmp sgt i32 [[TMP9]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_3]], label [[IF_THEN_I_3:%.*]], label [[IF_END_I_3]] +// CHECK-AMD: if.then.i.3: +// CHECK-AMD-NEXT: [[INC2_I_3:%.*]] = add nsw i32 [[STACKPTR_0_I_3]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_3:%.*]] = sext i32 [[STACKPTR_0_I_3]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3]], i64 0, i64 [[IDXPROM3_I_3]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_3]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3]] +// CHECK-AMD: if.end.i.3: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3:%.*]] = phi i32 [ [[INC2_I_3]], [[IF_THEN_I_3]] ], [ [[STACKPTR_0_I_3]], [[DO_BODY_I_3]] ] +// CHECK-AMD-NEXT: [[DEC_I_3]] = add nsw i32 [[STACKPTR_1_I_3]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_3:%.*]] = sext i32 [[DEC_I_3]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3]], i64 0, i64 [[IDXPROM5_I_3]] +// CHECK-AMD-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX6_I_3]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_3]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_3:%.*]] = icmp sgt i32 [[TMP10]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_3]], label [[DO_BODY_I_3]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.3: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_3]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_133:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_133]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_134:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_133]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_137:%.*]] +// CHECK-AMD: do.body.i.137: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_135:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3]] ], [ [[DEC_I_143:%.*]], [[IF_END_I_147:%.*]] ] +// CHECK-AMD-NEXT: [[TMP11:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_136:%.*]] = icmp sgt i32 [[TMP11]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_136]], label [[IF_THEN_I_141:%.*]], label [[IF_END_I_147]] +// CHECK-AMD: if.then.i.141: +// CHECK-AMD-NEXT: [[INC2_I_138:%.*]] = add nsw i32 [[STACKPTR_0_I_135]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_139:%.*]] = sext i32 [[STACKPTR_0_I_135]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_140:%.*]] = 
getelementptr inbounds [64 x i32], ptr [[STACK_I_133]], i64 0, i64 [[IDXPROM3_I_139]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_140]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_147]] +// CHECK-AMD: if.end.i.147: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_142:%.*]] = phi i32 [ [[INC2_I_138]], [[IF_THEN_I_141]] ], [ [[STACKPTR_0_I_135]], [[DO_BODY_I_137]] ] +// CHECK-AMD-NEXT: [[DEC_I_143]] = add nsw i32 [[STACKPTR_1_I_142]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_144:%.*]] = sext i32 [[DEC_I_143]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_145:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_133]], i64 0, i64 [[IDXPROM5_I_144]] +// CHECK-AMD-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX6_I_145]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_134]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_146:%.*]] = icmp sgt i32 [[TMP12]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_146]], label [[DO_BODY_I_137]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_148:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.148: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_133]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_1_1:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_1_1]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1_1]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1_1:%.*]] +// CHECK-AMD: do.body.i.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1_1:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_148]] ], [ [[DEC_I_1_1:%.*]], [[IF_END_I_1_1:%.*]] ] +// CHECK-AMD-NEXT: [[TMP13:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_1_1:%.*]] = icmp sgt i32 [[TMP13]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_1_1]], label [[IF_THEN_I_1_1:%.*]], label [[IF_END_I_1_1]] +// CHECK-AMD: if.then.i.1.1: +// CHECK-AMD-NEXT: [[INC2_I_1_1:%.*]] = add nsw i32 [[STACKPTR_0_I_1_1]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_1_1:%.*]] = sext i32 [[STACKPTR_0_I_1_1]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1_1]], i64 0, i64 [[IDXPROM3_I_1_1]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1_1]] +// CHECK-AMD: if.end.i.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1_1:%.*]] = phi i32 [ [[INC2_I_1_1]], [[IF_THEN_I_1_1]] ], [ [[STACKPTR_0_I_1_1]], [[DO_BODY_I_1_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_1_1]] = add nsw i32 [[STACKPTR_1_I_1_1]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_1_1:%.*]] = sext i32 [[DEC_I_1_1]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1_1]], i64 0, i64 [[IDXPROM5_I_1_1]] +// CHECK-AMD-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX6_I_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_1_1]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_1_1:%.*]] = icmp sgt i32 [[TMP14]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_1_1]], label [[DO_BODY_I_1_1]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_1:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.1.1: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_1_1]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: 
[[STACK_I_2_1:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_2_1]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2_1]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2_1:%.*]] +// CHECK-AMD: do.body.i.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2_1:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_1]] ], [ [[DEC_I_2_1:%.*]], [[IF_END_I_2_1:%.*]] ] +// CHECK-AMD-NEXT: [[TMP15:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_2_1:%.*]] = icmp sgt i32 [[TMP15]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_2_1]], label [[IF_THEN_I_2_1:%.*]], label [[IF_END_I_2_1]] +// CHECK-AMD: if.then.i.2.1: +// CHECK-AMD-NEXT: [[INC2_I_2_1:%.*]] = add nsw i32 [[STACKPTR_0_I_2_1]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_2_1:%.*]] = sext i32 [[STACKPTR_0_I_2_1]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2_1]], i64 0, i64 [[IDXPROM3_I_2_1]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2_1]] +// CHECK-AMD: if.end.i.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2_1:%.*]] = phi i32 [ [[INC2_I_2_1]], [[IF_THEN_I_2_1]] ], [ [[STACKPTR_0_I_2_1]], [[DO_BODY_I_2_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_2_1]] = add nsw i32 [[STACKPTR_1_I_2_1]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_2_1:%.*]] = sext i32 [[DEC_I_2_1]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2_1]], i64 0, i64 [[IDXPROM5_I_2_1]] +// CHECK-AMD-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX6_I_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_2_1]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_2_1:%.*]] = icmp sgt i32 [[TMP16]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_2_1]], label [[DO_BODY_I_2_1]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_1:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.2.1: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_2_1]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_3_1:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_3_1]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_3_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3_1]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3_1:%.*]] +// CHECK-AMD: do.body.i.3.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3_1:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_1]] ], [ [[DEC_I_3_1:%.*]], [[IF_END_I_3_1:%.*]] ] +// CHECK-AMD-NEXT: [[TMP17:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_3_1:%.*]] = icmp sgt i32 [[TMP17]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_3_1]], label [[IF_THEN_I_3_1:%.*]], label [[IF_END_I_3_1]] +// CHECK-AMD: if.then.i.3.1: +// CHECK-AMD-NEXT: [[INC2_I_3_1:%.*]] = add nsw i32 [[STACKPTR_0_I_3_1]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_3_1:%.*]] = sext i32 [[STACKPTR_0_I_3_1]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3_1]], i64 0, i64 [[IDXPROM3_I_3_1]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_3_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3_1]] +// CHECK-AMD: 
if.end.i.3.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3_1:%.*]] = phi i32 [ [[INC2_I_3_1]], [[IF_THEN_I_3_1]] ], [ [[STACKPTR_0_I_3_1]], [[DO_BODY_I_3_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_3_1]] = add nsw i32 [[STACKPTR_1_I_3_1]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_3_1:%.*]] = sext i32 [[DEC_I_3_1]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3_1]], i64 0, i64 [[IDXPROM5_I_3_1]] +// CHECK-AMD-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX6_I_3_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_3_1]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_3_1:%.*]] = icmp sgt i32 [[TMP18]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_3_1]], label [[DO_BODY_I_3_1]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_1:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.3.1: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_3_1]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_249:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_249]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_250:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_249]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_253:%.*]] +// CHECK-AMD: do.body.i.253: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_251:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_1]] ], [ [[DEC_I_259:%.*]], [[IF_END_I_263:%.*]] ] +// CHECK-AMD-NEXT: [[TMP19:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_252:%.*]] = icmp sgt i32 [[TMP19]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_252]], label [[IF_THEN_I_257:%.*]], label [[IF_END_I_263]] +// CHECK-AMD: if.then.i.257: +// CHECK-AMD-NEXT: [[INC2_I_254:%.*]] = add nsw i32 [[STACKPTR_0_I_251]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_255:%.*]] = sext i32 [[STACKPTR_0_I_251]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_256:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_249]], i64 0, i64 [[IDXPROM3_I_255]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_256]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_263]] +// CHECK-AMD: if.end.i.263: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_258:%.*]] = phi i32 [ [[INC2_I_254]], [[IF_THEN_I_257]] ], [ [[STACKPTR_0_I_251]], [[DO_BODY_I_253]] ] +// CHECK-AMD-NEXT: [[DEC_I_259]] = add nsw i32 [[STACKPTR_1_I_258]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_260:%.*]] = sext i32 [[DEC_I_259]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_261:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_249]], i64 0, i64 [[IDXPROM5_I_260]] +// CHECK-AMD-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX6_I_261]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_250]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_262:%.*]] = icmp sgt i32 [[TMP20]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_262]], label [[DO_BODY_I_253]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_264:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.264: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_249]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_1_2:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_1_2]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_1_2:%.*]] = 
getelementptr inbounds [64 x i32], ptr [[STACK_I_1_2]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1_2:%.*]] +// CHECK-AMD: do.body.i.1.2: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1_2:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_264]] ], [ [[DEC_I_1_2:%.*]], [[IF_END_I_1_2:%.*]] ] +// CHECK-AMD-NEXT: [[TMP21:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_1_2:%.*]] = icmp sgt i32 [[TMP21]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_1_2]], label [[IF_THEN_I_1_2:%.*]], label [[IF_END_I_1_2]] +// CHECK-AMD: if.then.i.1.2: +// CHECK-AMD-NEXT: [[INC2_I_1_2:%.*]] = add nsw i32 [[STACKPTR_0_I_1_2]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_1_2:%.*]] = sext i32 [[STACKPTR_0_I_1_2]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1_2]], i64 0, i64 [[IDXPROM3_I_1_2]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_1_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1_2]] +// CHECK-AMD: if.end.i.1.2: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1_2:%.*]] = phi i32 [ [[INC2_I_1_2]], [[IF_THEN_I_1_2]] ], [ [[STACKPTR_0_I_1_2]], [[DO_BODY_I_1_2]] ] +// CHECK-AMD-NEXT: [[DEC_I_1_2]] = add nsw i32 [[STACKPTR_1_I_1_2]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_1_2:%.*]] = sext i32 [[DEC_I_1_2]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1_2]], i64 0, i64 [[IDXPROM5_I_1_2]] +// CHECK-AMD-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX6_I_1_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_1_2]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_1_2:%.*]] = icmp sgt i32 [[TMP22]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_1_2]], label [[DO_BODY_I_1_2]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_2:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.1.2: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_1_2]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_2_2:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_2_2]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_2_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2_2]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2_2:%.*]] +// CHECK-AMD: do.body.i.2.2: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2_2:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_2]] ], [ [[DEC_I_2_2:%.*]], [[IF_END_I_2_2:%.*]] ] +// CHECK-AMD-NEXT: [[TMP23:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_2_2:%.*]] = icmp sgt i32 [[TMP23]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_2_2]], label [[IF_THEN_I_2_2:%.*]], label [[IF_END_I_2_2]] +// CHECK-AMD: if.then.i.2.2: +// CHECK-AMD-NEXT: [[INC2_I_2_2:%.*]] = add nsw i32 [[STACKPTR_0_I_2_2]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_2_2:%.*]] = sext i32 [[STACKPTR_0_I_2_2]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2_2]], i64 0, i64 [[IDXPROM3_I_2_2]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_2_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2_2]] +// CHECK-AMD: if.end.i.2.2: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2_2:%.*]] = phi i32 [ [[INC2_I_2_2]], [[IF_THEN_I_2_2]] ], [ [[STACKPTR_0_I_2_2]], [[DO_BODY_I_2_2]] ] +// CHECK-AMD-NEXT: [[DEC_I_2_2]] = add nsw i32 [[STACKPTR_1_I_2_2]], -1 +// 
CHECK-AMD-NEXT: [[IDXPROM5_I_2_2:%.*]] = sext i32 [[DEC_I_2_2]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2_2]], i64 0, i64 [[IDXPROM5_I_2_2]] +// CHECK-AMD-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX6_I_2_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_2_2]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_2_2:%.*]] = icmp sgt i32 [[TMP24]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_2_2]], label [[DO_BODY_I_2_2]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_2:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.2.2: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_2_2]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: [[STACK_I_3_2:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK_I_3_2]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7_I_3_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3_2]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3_2:%.*]] +// CHECK-AMD: do.body.i.3.2: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3_2:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_2]] ], [ [[DEC_I_3_2:%.*]], [[IF_END_I_3_2:%.*]] ] +// CHECK-AMD-NEXT: [[TMP25:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I_3_2:%.*]] = icmp sgt i32 [[TMP25]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP_I_3_2]], label [[IF_THEN_I_3_2:%.*]], label [[IF_END_I_3_2]] +// CHECK-AMD: if.then.i.3.2: +// CHECK-AMD-NEXT: [[INC2_I_3_2:%.*]] = add nsw i32 [[STACKPTR_0_I_3_2]], 1 +// CHECK-AMD-NEXT: [[IDXPROM3_I_3_2:%.*]] = sext i32 [[STACKPTR_0_I_3_2]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3_2]], i64 0, i64 [[IDXPROM3_I_3_2]] +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4_I_3_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3_2]] +// CHECK-AMD: if.end.i.3.2: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3_2:%.*]] = phi i32 [ [[INC2_I_3_2]], [[IF_THEN_I_3_2]] ], [ [[STACKPTR_0_I_3_2]], [[DO_BODY_I_3_2]] ] +// CHECK-AMD-NEXT: [[DEC_I_3_2]] = add nsw i32 [[STACKPTR_1_I_3_2]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5_I_3_2:%.*]] = sext i32 [[DEC_I_3_2]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3_2]], i64 0, i64 [[IDXPROM5_I_3_2]] +// CHECK-AMD-NEXT: [[TMP26]] = load i32, ptr [[ARRAYIDX6_I_3_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_3_2]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[CMP8_I_3_2:%.*]] = icmp sgt i32 [[TMP26]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8_I_3_2]], label [[DO_BODY_I_3_2]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_2]], !llvm.loop [[LOOP15]] +// CHECK-AMD: _Z18emits_alloc_sharedPKiPi.internalized.exit.3.2: +// CHECK-AMD-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_3_2]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: br i1 [[CMP3]], label [[FOR_COND4_PREHEADER]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK-AMD: omp.loop.exit: +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP0]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_IS_LAST]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) 
[[DOTOMP_STRIDE]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_UB]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_LB]]) #[[ATTR2]] +// CHECK-AMD-NEXT: ret void +// +// +// CHECK-AMD-LABEL: define {{[^@]+}}@_Z18emits_alloc_sharedPKiPi +// CHECK-AMD-SAME: (ptr nocapture noundef readonly [[LOCALPADDING:%.*]], ptr nocapture noundef [[RES:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[STACK:%.*]] = tail call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 -1, ptr [[STACK]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: store i32 0, ptr [[RES]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK]], i64 0, i64 2 +// CHECK-AMD-NEXT: br label [[DO_BODY:%.*]] +// CHECK-AMD: do.body: +// CHECK-AMD-NEXT: [[STACKPTR_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[DEC:%.*]], [[IF_END:%.*]] ] +// CHECK-AMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]] +// CHECK-AMD: if.then: +// CHECK-AMD-NEXT: [[IDXPROM3:%.*]] = sext i32 [[STACKPTR_0]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK]], i64 0, i64 [[IDXPROM3]] +// CHECK-AMD-NEXT: [[INC2:%.*]] = add nsw i32 [[STACKPTR_0]], 1 +// CHECK-AMD-NEXT: store i32 0, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END]] +// CHECK-AMD: if.end: +// CHECK-AMD-NEXT: [[STACKPTR_1:%.*]] = phi i32 [ [[INC2]], [[IF_THEN]] ], [ [[STACKPTR_0]], [[DO_BODY]] ] +// CHECK-AMD-NEXT: [[DEC]] = add nsw i32 [[STACKPTR_1]], -1 +// CHECK-AMD-NEXT: [[IDXPROM5:%.*]] = sext i32 [[DEC]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK]], i64 0, i64 [[IDXPROM5]] +// CHECK-AMD-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: store i32 [[TMP1]], ptr [[RES]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: tail call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7]]) #[[ATTR13]] +// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr [[RES]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[TMP2]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP8]], label [[DO_BODY]], label [[DO_END:%.*]], !llvm.loop [[LOOP15]] +// CHECK-AMD: do.end: +// CHECK-AMD-NEXT: tail call void @__kmpc_free_shared(ptr nonnull [[STACK]], i64 256) #[[ATTR2]] +// CHECK-AMD-NEXT: ret void +// +// +// CHECK-AMD-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77 +// CHECK-AMD-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[LOCALPADDING:%.*]], i64 noundef [[RES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[DOTOMP_COMB_LB_I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_COMB_UB_I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [4 x ptr], align 8, addrspace(5) +// CHECK-AMD-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2, i1 
false) #[[ATTR2]] +// CHECK-AMD-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +// CHECK-AMD-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] +// CHECK-AMD: common.ret: +// CHECK-AMD-NEXT: ret void +// CHECK-AMD: user_code.entry: +// CHECK-AMD-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]]) +// CHECK-AMD-NEXT: [[DOTOMP_COMB_LB_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB_I]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_COMB_UB_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB_I]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE_I]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST_I]] to ptr +// CHECK-AMD-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]] to ptr +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_COMB_LB_I]]) #[[ATTR12]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_COMB_UB_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 999, ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_STRIDE_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 1, ptr addrspace(5) [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_IS_LAST_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[DOTOMP_IS_LAST_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[NVPTX_NUM_THREADS_I:%.*]] = tail call i32 @__kmpc_get_hardware_num_threads_in_block() #[[ATTR2]] +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP1]], i32 91, ptr nonnull [[DOTOMP_IS_LAST_ASCAST_I]], ptr nonnull [[DOTOMP_COMB_LB_ASCAST_I]], ptr nonnull [[DOTOMP_COMB_UB_ASCAST_I]], ptr nonnull [[DOTOMP_STRIDE_ASCAST_I]], i32 1, i32 [[NVPTX_NUM_THREADS_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4 +// CHECK-AMD-NEXT: [[COND_I:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 999) +// CHECK-AMD-NEXT: store i32 [[COND_I]], ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[DOTOMP_IV_01_I:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_COMB_LB_I]], align 4 +// CHECK-AMD-NEXT: [[CMP12_I:%.*]] = icmp slt i32 [[DOTOMP_IV_01_I]], 1000 +// CHECK-AMD-NEXT: br i1 [[CMP12_I]], label [[OMP_INNER_FOR_BODY_LR_PH_I:%.*]], label %{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined.exit +// CHECK-AMD: omp.inner.for.body.lr.ph.i: +// CHECK-AMD-NEXT: [[RES_CASTED_SROA_0_0_INSERT_EXT:%.*]] = and i64 [[RES]], 4294967295 +// CHECK-AMD-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x ptr], ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]], i32 0, i32 1 +// CHECK-AMD-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]], i32 0, i32 2 +// CHECK-AMD-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]], i32 
0, i32 3 +// CHECK-AMD-NEXT: [[TMP6:%.*]] = inttoptr i64 [[RES_CASTED_SROA_0_0_INSERT_EXT]] to ptr +// CHECK-AMD-NEXT: br label [[OMP_INNER_FOR_BODY_I:%.*]] +// CHECK-AMD: omp.inner.for.body.i: +// CHECK-AMD-NEXT: [[DOTOMP_IV_04_I:%.*]] = phi i32 [ [[DOTOMP_IV_01_I]], [[OMP_INNER_FOR_BODY_LR_PH_I]] ], [ [[ADD2_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ] +// CHECK-AMD-NEXT: [[STOREMERGE3_I:%.*]] = phi i32 [ [[COND_I]], [[OMP_INNER_FOR_BODY_LR_PH_I]] ], [ [[COND8_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ] +// CHECK-AMD-NEXT: [[TMP7:%.*]] = zext i32 [[DOTOMP_IV_04_I]] to i64 +// CHECK-AMD-NEXT: [[TMP8:%.*]] = zext i32 [[STOREMERGE3_I]] to i64 +// CHECK-AMD-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP7]] to ptr +// CHECK-AMD-NEXT: store ptr [[TMP9]], ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]], align 8, !tbaa [[TBAA13]] +// CHECK-AMD-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP8]] to ptr +// CHECK-AMD-NEXT: store ptr [[TMP10]], ptr addrspace(5) [[TMP3]], align 8, !tbaa [[TBAA13]] +// CHECK-AMD-NEXT: store ptr [[LOCALPADDING]], ptr addrspace(5) [[TMP4]], align 8, !tbaa [[TBAA13]] +// CHECK-AMD-NEXT: store ptr [[TMP6]], ptr addrspace(5) [[TMP5]], align 8, !tbaa [[TBAA13]] +// CHECK-AMD-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined_omp_outlined, ptr null, ptr nonnull [[CAPTURED_VARS_ADDRS_ASCAST_I]], i64 4) #[[ATTR2]] +// CHECK-AMD-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ADD2_I]] = add nsw i32 [[TMP12]], [[TMP11]] +// CHECK-AMD-NEXT: store i32 [[ADD2_I]], ptr addrspace(5) [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP13]], [[TMP11]] +// CHECK-AMD-NEXT: [[COND8_I]] = call i32 @llvm.smin.i32(i32 [[ADD3_I]], i32 999) +// CHECK-AMD-NEXT: store i32 [[COND8_I]], ptr addrspace(5) [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP1_I:%.*]] = icmp slt i32 [[ADD2_I]], 1000 +// CHECK-AMD-NEXT: br i1 [[CMP1_I]], label [[OMP_INNER_FOR_BODY_I]], label %{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined.exit +// CHECK-AMD: {{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined.exit: +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP1]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_IS_LAST_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_STRIDE_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_COMB_UB_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_COMB_LB_I]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) [[CAPTURED_VARS_ADDRS_I]]) +// CHECK-AMD-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2) #[[ATTR2]] +// CHECK-AMD-NEXT: br label [[COMMON_RET]] +// +// +// CHECK-AMD-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined_omp_outlined +// CHECK-AMD-SAME: (ptr noalias nocapture noundef readonly [[DOTGLOBAL_TID_:%.*]], ptr noalias nocapture readnone [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr nocapture noundef nonnull align 4 dereferenceable(4000) [[LOCALPADDING:%.*]], i64 [[RES:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[STACK_H2S1_I:%.*]] = alloca [256 x i8], align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_LB]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_UB]]) #[[ATTR2]] +// CHECK-AMD-NEXT: [[CONV:%.*]] = trunc i64 [[DOTPREVIOUS_LB_]] to i32 +// CHECK-AMD-NEXT: [[CONV1:%.*]] = trunc i64 [[DOTPREVIOUS_UB_]] to i32 +// CHECK-AMD-NEXT: store i32 [[CONV]], ptr addrspace(5) [[DOTOMP_LB]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: store i32 [[CONV1]], ptr addrspace(5) [[DOTOMP_UB]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_STRIDE]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 1, ptr addrspace(5) [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[DOTOMP_IS_LAST]]) #[[ATTR2]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP0]], i32 33, ptr nonnull [[DOTOMP_IS_LAST_ASCAST]], ptr nonnull [[DOTOMP_LB_ASCAST]], ptr nonnull [[DOTOMP_UB_ASCAST]], ptr nonnull [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1) #[[ATTR2]] +// CHECK-AMD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_LB]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CONV228:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-AMD-NEXT: [[CMP_NOT29:%.*]] = icmp ugt i64 [[CONV228]], [[DOTPREVIOUS_UB_]] +// CHECK-AMD-NEXT: br i1 [[CMP_NOT29]], label [[OMP_LOOP_EXIT:%.*]], label [[FOR_COND_PREHEADER_LR_PH:%.*]] +// CHECK-AMD: for.cond.preheader.lr.ph: +// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[FOR_COND_PREHEADER:%.*]] +// CHECK-AMD: for.cond.preheader: +// CHECK-AMD-NEXT: [[CONV231:%.*]] = phi i64 [ [[CONV228]], [[FOR_COND_PREHEADER_LR_PH]] ], [ [[CONV2:%.*]], [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_2_1:%.*]] ] +// CHECK-AMD-NEXT: [[INDVARS:%.*]] = trunc i64 [[CONV231]] to i32 +// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, 
!tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP3]], 0 +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I:%.*]] +// CHECK-AMD: do.body.i: +// CHECK-AMD-NEXT: [[STACKPTR_0_I:%.*]] = phi i32 [ 1, [[FOR_COND_PREHEADER]] ], [ [[DEC_I:%.*]], [[IF_END_I:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I]] +// CHECK-AMD: if.then.i: +// CHECK-AMD-NEXT: [[INC2_I:%.*]] = add nsw i32 [[STACKPTR_0_I]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I]] +// CHECK-AMD: if.end.i: +// CHECK-AMD-NEXT: [[STACKPTR_1_I:%.*]] = phi i32 [ [[INC2_I]], [[IF_THEN_I]] ], [ [[STACKPTR_0_I]], [[DO_BODY_I]] ] +// CHECK-AMD-NEXT: [[DEC_I]] = add nsw i32 [[STACKPTR_1_I]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I]] +// CHECK-AMD-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I:%.*]] = icmp sgt i32 [[TMP4]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I]], label [[DO_BODY_I]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT:%.*]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1:%.*]] +// CHECK-AMD: do.body.i.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT]] ], [ [[DEC_I_1:%.*]], [[IF_END_I_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_1:%.*]], label [[IF_END_I_1]] +// CHECK-AMD: if.then.i.1: +// CHECK-AMD-NEXT: [[INC2_I_1:%.*]] = add nsw i32 [[STACKPTR_0_I_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1]] +// CHECK-AMD: if.end.i.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1:%.*]] = phi i32 [ [[INC2_I_1]], [[IF_THEN_I_1]] ], [ [[STACKPTR_0_I_1]], [[DO_BODY_I_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_1]] = add nsw i32 [[STACKPTR_1_I_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_1]] +// CHECK-AMD-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_1:%.*]] = icmp sgt i32 [[TMP5]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_1]], label [[DO_BODY_I_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, 
ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2:%.*]] +// CHECK-AMD: do.body.i.2: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1]] ], [ [[DEC_I_2:%.*]], [[IF_END_I_2:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_2:%.*]], label [[IF_END_I_2]] +// CHECK-AMD: if.then.i.2: +// CHECK-AMD-NEXT: [[INC2_I_2:%.*]] = add nsw i32 [[STACKPTR_0_I_2]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_2]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2]] +// CHECK-AMD: if.end.i.2: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2:%.*]] = phi i32 [ [[INC2_I_2]], [[IF_THEN_I_2]] ], [ [[STACKPTR_0_I_2]], [[DO_BODY_I_2]] ] +// CHECK-AMD-NEXT: [[DEC_I_2]] = add nsw i32 [[STACKPTR_1_I_2]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_2]] +// CHECK-AMD-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_2:%.*]] = icmp sgt i32 [[TMP6]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_2]], label [[DO_BODY_I_2]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.2: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3:%.*]] +// CHECK-AMD: do.body.i.3: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2]] ], [ [[DEC_I_3:%.*]], [[IF_END_I_3:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_3:%.*]], label [[IF_END_I_3]] +// CHECK-AMD: if.then.i.3: +// CHECK-AMD-NEXT: [[INC2_I_3:%.*]] = add nsw i32 [[STACKPTR_0_I_3]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_3]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_3]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3]] +// CHECK-AMD: if.end.i.3: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3:%.*]] = phi i32 [ [[INC2_I_3]], [[IF_THEN_I_3]] ], [ [[STACKPTR_0_I_3]], [[DO_BODY_I_3]] ] +// CHECK-AMD-NEXT: [[DEC_I_3]] = add nsw i32 [[STACKPTR_1_I_3]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_3]] +// CHECK-AMD-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_3]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_3:%.*]] = icmp sgt i32 [[TMP7]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_3]], label [[DO_BODY_I_3]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.3: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// 
CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_133:%.*]] +// CHECK-AMD: do.body.i.133: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_132:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3]] ], [ [[DEC_I_138:%.*]], [[IF_END_I_141:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_136:%.*]], label [[IF_END_I_141]] +// CHECK-AMD: if.then.i.136: +// CHECK-AMD-NEXT: [[INC2_I_134:%.*]] = add nsw i32 [[STACKPTR_0_I_132]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_135:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_132]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_135]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_141]] +// CHECK-AMD: if.end.i.141: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_137:%.*]] = phi i32 [ [[INC2_I_134]], [[IF_THEN_I_136]] ], [ [[STACKPTR_0_I_132]], [[DO_BODY_I_133]] ] +// CHECK-AMD-NEXT: [[DEC_I_138]] = add nsw i32 [[STACKPTR_1_I_137]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_139:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_138]] +// CHECK-AMD-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_139]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_140:%.*]] = icmp sgt i32 [[TMP8]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_140]], label [[DO_BODY_I_133]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_142:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.142: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1_1:%.*]] +// CHECK-AMD: do.body.i.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_142]] ], [ [[DEC_I_1_1:%.*]], [[IF_END_I_1_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_1_1:%.*]], label [[IF_END_I_1_1]] +// CHECK-AMD: if.then.i.1.1: +// CHECK-AMD-NEXT: [[INC2_I_1_1:%.*]] = add nsw i32 [[STACKPTR_0_I_1_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_1_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1_1]] +// CHECK-AMD: if.end.i.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1_1:%.*]] = phi i32 [ [[INC2_I_1_1]], [[IF_THEN_I_1_1]] ], [ [[STACKPTR_0_I_1_1]], [[DO_BODY_I_1_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_1_1]] = add nsw i32 [[STACKPTR_1_I_1_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_1_1]] +// CHECK-AMD-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_1_1:%.*]] = icmp sgt i32 [[TMP9]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_1_1]], label [[DO_BODY_I_1_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.1.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call 
void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2_1:%.*]] +// CHECK-AMD: do.body.i.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_1]] ], [ [[DEC_I_2_1:%.*]], [[IF_END_I_2_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_2_1:%.*]], label [[IF_END_I_2_1]] +// CHECK-AMD: if.then.i.2.1: +// CHECK-AMD-NEXT: [[INC2_I_2_1:%.*]] = add nsw i32 [[STACKPTR_0_I_2_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_2_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2_1]] +// CHECK-AMD: if.end.i.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2_1:%.*]] = phi i32 [ [[INC2_I_2_1]], [[IF_THEN_I_2_1]] ], [ [[STACKPTR_0_I_2_1]], [[DO_BODY_I_2_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_2_1]] = add nsw i32 [[STACKPTR_1_I_2_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_2_1]] +// CHECK-AMD-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_2_1:%.*]] = icmp sgt i32 [[TMP10]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_2_1]], label [[DO_BODY_I_2_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.2.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3_1:%.*]] +// CHECK-AMD: do.body.i.3.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_1]] ], [ [[DEC_I_3_1:%.*]], [[IF_END_I_3_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_3_1:%.*]], label [[IF_END_I_3_1]] +// CHECK-AMD: if.then.i.3.1: +// CHECK-AMD-NEXT: [[INC2_I_3_1:%.*]] = add nsw i32 [[STACKPTR_0_I_3_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_3_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_3_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3_1]] +// CHECK-AMD: if.end.i.3.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3_1:%.*]] = phi i32 [ [[INC2_I_3_1]], [[IF_THEN_I_3_1]] ], [ [[STACKPTR_0_I_3_1]], [[DO_BODY_I_3_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_3_1]] = add nsw i32 [[STACKPTR_1_I_3_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_3_1]] +// CHECK-AMD-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_3_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_3_1:%.*]] = icmp sgt i32 [[TMP11]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_3_1]], label [[DO_BODY_I_3_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.3.1: +// CHECK-AMD-NEXT: call void 
@llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_244:%.*]] +// CHECK-AMD: do.body.i.244: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_243:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_1]] ], [ [[DEC_I_249:%.*]], [[IF_END_I_252:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_247:%.*]], label [[IF_END_I_252]] +// CHECK-AMD: if.then.i.247: +// CHECK-AMD-NEXT: [[INC2_I_245:%.*]] = add nsw i32 [[STACKPTR_0_I_243]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_246:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_243]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_246]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_252]] +// CHECK-AMD: if.end.i.252: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_248:%.*]] = phi i32 [ [[INC2_I_245]], [[IF_THEN_I_247]] ], [ [[STACKPTR_0_I_243]], [[DO_BODY_I_244]] ] +// CHECK-AMD-NEXT: [[DEC_I_249]] = add nsw i32 [[STACKPTR_1_I_248]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_250:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_249]] +// CHECK-AMD-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_250]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_251:%.*]] = icmp sgt i32 [[TMP12]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_251]], label [[DO_BODY_I_244]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_253:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.253: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1_2:%.*]] +// CHECK-AMD: do.body.i.1.2: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1_2:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_253]] ], [ [[DEC_I_1_2:%.*]], [[IF_END_I_1_2:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_1_2:%.*]], label [[IF_END_I_1_2]] +// CHECK-AMD: if.then.i.1.2: +// CHECK-AMD-NEXT: [[INC2_I_1_2:%.*]] = add nsw i32 [[STACKPTR_0_I_1_2]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1_2:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_1_2]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_1_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1_2]] +// CHECK-AMD: if.end.i.1.2: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1_2:%.*]] = phi i32 [ [[INC2_I_1_2]], [[IF_THEN_I_1_2]] ], [ [[STACKPTR_0_I_1_2]], [[DO_BODY_I_1_2]] ] +// CHECK-AMD-NEXT: [[DEC_I_1_2]] = add nsw i32 [[STACKPTR_1_I_1_2]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1_2:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_1_2]] +// CHECK-AMD-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_1_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_1_2:%.*]] = icmp sgt i32 [[TMP13]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_1_2]], label [[DO_BODY_I_1_2]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_2:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: 
_Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.1.2: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2_2:%.*]] +// CHECK-AMD: do.body.i.2.2: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2_2:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_2]] ], [ [[DEC_I_2_2:%.*]], [[IF_END_I_2_2:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_2_2:%.*]], label [[IF_END_I_2_2]] +// CHECK-AMD: if.then.i.2.2: +// CHECK-AMD-NEXT: [[INC2_I_2_2:%.*]] = add nsw i32 [[STACKPTR_0_I_2_2]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2_2:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_2_2]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_2_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2_2]] +// CHECK-AMD: if.end.i.2.2: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2_2:%.*]] = phi i32 [ [[INC2_I_2_2]], [[IF_THEN_I_2_2]] ], [ [[STACKPTR_0_I_2_2]], [[DO_BODY_I_2_2]] ] +// CHECK-AMD-NEXT: [[DEC_I_2_2]] = add nsw i32 [[STACKPTR_1_I_2_2]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2_2:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_2_2]] +// CHECK-AMD-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_2_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_2_2:%.*]] = icmp sgt i32 [[TMP14]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_2_2]], label [[DO_BODY_I_2_2]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_2:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.2.2: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3_2:%.*]] +// CHECK-AMD: do.body.i.3.2: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3_2:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_2]] ], [ [[DEC_I_3_2:%.*]], [[IF_END_I_3_2:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_3_2:%.*]], label [[IF_END_I_3_2]] +// CHECK-AMD: if.then.i.3.2: +// CHECK-AMD-NEXT: [[INC2_I_3_2:%.*]] = add nsw i32 [[STACKPTR_0_I_3_2]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3_2:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_3_2]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_3_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3_2]] +// CHECK-AMD: if.end.i.3.2: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3_2:%.*]] = phi i32 [ [[INC2_I_3_2]], [[IF_THEN_I_3_2]] ], [ [[STACKPTR_0_I_3_2]], [[DO_BODY_I_3_2]] ] +// CHECK-AMD-NEXT: [[DEC_I_3_2]] = add nsw i32 [[STACKPTR_1_I_3_2]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3_2:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_3_2]] +// CHECK-AMD-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_3_2]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_3_2:%.*]] = icmp sgt i32 [[TMP15]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_3_2]], label [[DO_BODY_I_3_2]], label 
[[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_2:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.3.2: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_156:%.*]] +// CHECK-AMD: do.body.i.156: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_155:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_2]] ], [ [[DEC_I_161:%.*]], [[IF_END_I_164:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_159:%.*]], label [[IF_END_I_164]] +// CHECK-AMD: if.then.i.159: +// CHECK-AMD-NEXT: [[INC2_I_157:%.*]] = add nsw i32 [[STACKPTR_0_I_155]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_158:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_155]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_158]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_164]] +// CHECK-AMD: if.end.i.164: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_160:%.*]] = phi i32 [ [[INC2_I_157]], [[IF_THEN_I_159]] ], [ [[STACKPTR_0_I_155]], [[DO_BODY_I_156]] ] +// CHECK-AMD-NEXT: [[DEC_I_161]] = add nsw i32 [[STACKPTR_1_I_160]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_162:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_161]] +// CHECK-AMD-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_162]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_163:%.*]] = icmp sgt i32 [[TMP16]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_163]], label [[DO_BODY_I_156]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_165:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.165: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1_167:%.*]] +// CHECK-AMD: do.body.i.1.167: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1_166:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_165]] ], [ [[DEC_I_1_172:%.*]], [[IF_END_I_1_175:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_1_170:%.*]], label [[IF_END_I_1_175]] +// CHECK-AMD: if.then.i.1.170: +// CHECK-AMD-NEXT: [[INC2_I_1_168:%.*]] = add nsw i32 [[STACKPTR_0_I_1_166]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1_169:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_1_166]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_1_169]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1_175]] +// CHECK-AMD: if.end.i.1.175: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1_171:%.*]] = phi i32 [ [[INC2_I_1_168]], [[IF_THEN_I_1_170]] ], [ [[STACKPTR_0_I_1_166]], [[DO_BODY_I_1_167]] ] +// CHECK-AMD-NEXT: [[DEC_I_1_172]] = add nsw i32 [[STACKPTR_1_I_1_171]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1_173:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_1_172]] +// CHECK-AMD-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_1_173]], align 4, !tbaa [[TBAA9]] +// 
CHECK-AMD-NEXT: [[CMP7_I_1_174:%.*]] = icmp sgt i32 [[TMP17]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_1_174]], label [[DO_BODY_I_1_167]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_176:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.1.176: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2_178:%.*]] +// CHECK-AMD: do.body.i.2.178: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2_177:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_176]] ], [ [[DEC_I_2_183:%.*]], [[IF_END_I_2_186:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_2_181:%.*]], label [[IF_END_I_2_186]] +// CHECK-AMD: if.then.i.2.181: +// CHECK-AMD-NEXT: [[INC2_I_2_179:%.*]] = add nsw i32 [[STACKPTR_0_I_2_177]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2_180:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_2_177]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_2_180]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2_186]] +// CHECK-AMD: if.end.i.2.186: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2_182:%.*]] = phi i32 [ [[INC2_I_2_179]], [[IF_THEN_I_2_181]] ], [ [[STACKPTR_0_I_2_177]], [[DO_BODY_I_2_178]] ] +// CHECK-AMD-NEXT: [[DEC_I_2_183]] = add nsw i32 [[STACKPTR_1_I_2_182]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2_184:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_2_183]] +// CHECK-AMD-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_2_184]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_2_185:%.*]] = icmp sgt i32 [[TMP18]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_2_185]], label [[DO_BODY_I_2_178]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_187:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.2.187: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3_189:%.*]] +// CHECK-AMD: do.body.i.3.189: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3_188:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_187]] ], [ [[DEC_I_3_194:%.*]], [[IF_END_I_3_197:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_3_192:%.*]], label [[IF_END_I_3_197]] +// CHECK-AMD: if.then.i.3.192: +// CHECK-AMD-NEXT: [[INC2_I_3_190:%.*]] = add nsw i32 [[STACKPTR_0_I_3_188]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3_191:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_3_188]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_3_191]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3_197]] +// CHECK-AMD: if.end.i.3.197: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3_193:%.*]] = phi i32 [ [[INC2_I_3_190]], [[IF_THEN_I_3_192]] ], [ [[STACKPTR_0_I_3_188]], [[DO_BODY_I_3_189]] ] +// CHECK-AMD-NEXT: [[DEC_I_3_194]] = add nsw i32 [[STACKPTR_1_I_3_193]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3_195:%.*]] = 
getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_3_194]] +// CHECK-AMD-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_3_195]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_3_196:%.*]] = icmp sgt i32 [[TMP19]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_3_196]], label [[DO_BODY_I_3_189]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_198:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.3.198: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_133_1:%.*]] +// CHECK-AMD: do.body.i.133.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_132_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_198]] ], [ [[DEC_I_138_1:%.*]], [[IF_END_I_141_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_136_1:%.*]], label [[IF_END_I_141_1]] +// CHECK-AMD: if.then.i.136.1: +// CHECK-AMD-NEXT: [[INC2_I_134_1:%.*]] = add nsw i32 [[STACKPTR_0_I_132_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_135_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_132_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_135_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_141_1]] +// CHECK-AMD: if.end.i.141.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_137_1:%.*]] = phi i32 [ [[INC2_I_134_1]], [[IF_THEN_I_136_1]] ], [ [[STACKPTR_0_I_132_1]], [[DO_BODY_I_133_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_138_1]] = add nsw i32 [[STACKPTR_1_I_137_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_139_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_138_1]] +// CHECK-AMD-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_139_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_140_1:%.*]] = icmp sgt i32 [[TMP20]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_140_1]], label [[DO_BODY_I_133_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_142_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.142.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1_1_1:%.*]] +// CHECK-AMD: do.body.i.1.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1_1_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_142_1]] ], [ [[DEC_I_1_1_1:%.*]], [[IF_END_I_1_1_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_1_1_1:%.*]], label [[IF_END_I_1_1_1]] +// CHECK-AMD: if.then.i.1.1.1: +// CHECK-AMD-NEXT: [[INC2_I_1_1_1:%.*]] = add nsw i32 [[STACKPTR_0_I_1_1_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_1_1_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_1_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1_1_1]] +// CHECK-AMD: if.end.i.1.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1_1_1:%.*]] = phi i32 [ 
[[INC2_I_1_1_1]], [[IF_THEN_I_1_1_1]] ], [ [[STACKPTR_0_I_1_1_1]], [[DO_BODY_I_1_1_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_1_1_1]] = add nsw i32 [[STACKPTR_1_I_1_1_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_1_1_1]] +// CHECK-AMD-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_1_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_1_1_1:%.*]] = icmp sgt i32 [[TMP21]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_1_1_1]], label [[DO_BODY_I_1_1_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_1_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.1.1.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2_1_1:%.*]] +// CHECK-AMD: do.body.i.2.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2_1_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_1_1]] ], [ [[DEC_I_2_1_1:%.*]], [[IF_END_I_2_1_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_2_1_1:%.*]], label [[IF_END_I_2_1_1]] +// CHECK-AMD: if.then.i.2.1.1: +// CHECK-AMD-NEXT: [[INC2_I_2_1_1:%.*]] = add nsw i32 [[STACKPTR_0_I_2_1_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_2_1_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_2_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2_1_1]] +// CHECK-AMD: if.end.i.2.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2_1_1:%.*]] = phi i32 [ [[INC2_I_2_1_1]], [[IF_THEN_I_2_1_1]] ], [ [[STACKPTR_0_I_2_1_1]], [[DO_BODY_I_2_1_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_2_1_1]] = add nsw i32 [[STACKPTR_1_I_2_1_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_2_1_1]] +// CHECK-AMD-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_2_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_2_1_1:%.*]] = icmp sgt i32 [[TMP22]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_2_1_1]], label [[DO_BODY_I_2_1_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_1_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.2.1.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3_1_1:%.*]] +// CHECK-AMD: do.body.i.3.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3_1_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_1_1]] ], [ [[DEC_I_3_1_1:%.*]], [[IF_END_I_3_1_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_3_1_1:%.*]], label [[IF_END_I_3_1_1]] +// CHECK-AMD: if.then.i.3.1.1: +// CHECK-AMD-NEXT: [[INC2_I_3_1_1:%.*]] = add nsw i32 [[STACKPTR_0_I_3_1_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_3_1_1]] +// CHECK-AMD-NEXT: store i32 0, 
ptr addrspace(5) [[ARRAYIDX4_I_3_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3_1_1]] +// CHECK-AMD: if.end.i.3.1.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3_1_1:%.*]] = phi i32 [ [[INC2_I_3_1_1]], [[IF_THEN_I_3_1_1]] ], [ [[STACKPTR_0_I_3_1_1]], [[DO_BODY_I_3_1_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_3_1_1]] = add nsw i32 [[STACKPTR_1_I_3_1_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_3_1_1]] +// CHECK-AMD-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_3_1_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_3_1_1:%.*]] = icmp sgt i32 [[TMP23]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_3_1_1]], label [[DO_BODY_I_3_1_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_1_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.3.1.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_244_1:%.*]] +// CHECK-AMD: do.body.i.244.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_243_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_1_1]] ], [ [[DEC_I_249_1:%.*]], [[IF_END_I_252_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_247_1:%.*]], label [[IF_END_I_252_1]] +// CHECK-AMD: if.then.i.247.1: +// CHECK-AMD-NEXT: [[INC2_I_245_1:%.*]] = add nsw i32 [[STACKPTR_0_I_243_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_246_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_243_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_246_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_252_1]] +// CHECK-AMD: if.end.i.252.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_248_1:%.*]] = phi i32 [ [[INC2_I_245_1]], [[IF_THEN_I_247_1]] ], [ [[STACKPTR_0_I_243_1]], [[DO_BODY_I_244_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_249_1]] = add nsw i32 [[STACKPTR_1_I_248_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_250_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_249_1]] +// CHECK-AMD-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_250_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_251_1:%.*]] = icmp sgt i32 [[TMP24]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_251_1]], label [[DO_BODY_I_244_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_253_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.253.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_1_2_1:%.*]] +// CHECK-AMD: do.body.i.1.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_1_2_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_253_1]] ], [ [[DEC_I_1_2_1:%.*]], [[IF_END_I_1_2_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_1_2_1:%.*]], label [[IF_END_I_1_2_1]] +// CHECK-AMD: if.then.i.1.2.1: +// CHECK-AMD-NEXT: [[INC2_I_1_2_1:%.*]] = add nsw i32 
[[STACKPTR_0_I_1_2_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_1_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_1_2_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_1_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_1_2_1]] +// CHECK-AMD: if.end.i.1.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_1_2_1:%.*]] = phi i32 [ [[INC2_I_1_2_1]], [[IF_THEN_I_1_2_1]] ], [ [[STACKPTR_0_I_1_2_1]], [[DO_BODY_I_1_2_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_1_2_1]] = add nsw i32 [[STACKPTR_1_I_1_2_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_1_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_1_2_1]] +// CHECK-AMD-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_1_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_1_2_1:%.*]] = icmp sgt i32 [[TMP25]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_1_2_1]], label [[DO_BODY_I_1_2_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_2_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.1.2.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_2_2_1:%.*]] +// CHECK-AMD: do.body.i.2.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_2_2_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_2_1]] ], [ [[DEC_I_2_2_1:%.*]], [[IF_END_I_2_2_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_2_2_1:%.*]], label [[IF_END_I_2_2_1]] +// CHECK-AMD: if.then.i.2.2.1: +// CHECK-AMD-NEXT: [[INC2_I_2_2_1:%.*]] = add nsw i32 [[STACKPTR_0_I_2_2_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_2_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_2_2_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_2_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_2_2_1]] +// CHECK-AMD: if.end.i.2.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_2_2_1:%.*]] = phi i32 [ [[INC2_I_2_2_1]], [[IF_THEN_I_2_2_1]] ], [ [[STACKPTR_0_I_2_2_1]], [[DO_BODY_I_2_2_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_2_2_1]] = add nsw i32 [[STACKPTR_1_I_2_2_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_2_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_2_2_1]] +// CHECK-AMD-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_2_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_2_2_1:%.*]] = icmp sgt i32 [[TMP26]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_2_2_1]], label [[DO_BODY_I_2_2_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_2_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.2.2.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S1_I]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY_I_3_2_1:%.*]] +// CHECK-AMD: do.body.i.3.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_0_I_3_2_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_2_1]] ], [ [[DEC_I_3_2_1:%.*]], 
[[IF_END_I_3_2_1:%.*]] ] +// CHECK-AMD-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_3_2_1:%.*]], label [[IF_END_I_3_2_1]] +// CHECK-AMD: if.then.i.3.2.1: +// CHECK-AMD-NEXT: [[INC2_I_3_2_1:%.*]] = add nsw i32 [[STACKPTR_0_I_3_2_1]], 1 +// CHECK-AMD-NEXT: [[ARRAYIDX4_I_3_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[STACKPTR_0_I_3_2_1]] +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4_I_3_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END_I_3_2_1]] +// CHECK-AMD: if.end.i.3.2.1: +// CHECK-AMD-NEXT: [[STACKPTR_1_I_3_2_1:%.*]] = phi i32 [ [[INC2_I_3_2_1]], [[IF_THEN_I_3_2_1]] ], [ [[STACKPTR_0_I_3_2_1]], [[DO_BODY_I_3_2_1]] ] +// CHECK-AMD-NEXT: [[DEC_I_3_2_1]] = add nsw i32 [[STACKPTR_1_I_3_2_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6_I_3_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S1_I]], i32 0, i32 [[DEC_I_3_2_1]] +// CHECK-AMD-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6_I_3_2_1]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7_I_3_2_1:%.*]] = icmp sgt i32 [[TMP27]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7_I_3_2_1]], label [[DO_BODY_I_3_2_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_2_1]], !llvm.loop [[LOOP18]] +// CHECK-AMD: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.3.2.1: +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) [[STACK_H2S1_I]]) +// CHECK-AMD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[LOCALPADDING]], i64 0, i64 [[CONV231]] +// CHECK-AMD-NEXT: store i32 [[TMP27]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP2]], [[INDVARS]] +// CHECK-AMD-NEXT: [[CONV2]] = sext i32 [[ADD18]] to i64 +// CHECK-AMD-NEXT: [[CMP_NOT:%.*]] = icmp ugt i64 [[CONV2]], [[DOTPREVIOUS_UB_]] +// CHECK-AMD-NEXT: br i1 [[CMP_NOT]], label [[OMP_LOOP_EXIT]], label [[FOR_COND_PREHEADER]] +// CHECK-AMD: omp.loop.exit: +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP0]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_IS_LAST]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_STRIDE]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_UB]]) #[[ATTR2]] +// CHECK-AMD-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[DOTOMP_LB]]) #[[ATTR2]] +// CHECK-AMD-NEXT: ret void +// +// +// CHECK-AMD-LABEL: define {{[^@]+}}@_Z26does_not_emit_alloc_sharedPKiPi +// CHECK-AMD-SAME: (ptr nocapture noundef readonly [[LOCALPADDING:%.*]], ptr nocapture noundef writeonly [[RES:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[STACK_H2S12:%.*]] = alloca [256 x i8], align 8, addrspace(5) +// CHECK-AMD-NEXT: store i32 -1, ptr addrspace(5) [[STACK_H2S12]], align 8, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: store i32 0, ptr [[RES]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[DO_BODY:%.*]] +// CHECK-AMD: do.body: +// CHECK-AMD-NEXT: [[STACKPTR_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[DEC:%.*]], [[IF_END:%.*]] ] +// CHECK-AMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]] +// CHECK-AMD: if.then: +// CHECK-AMD-NEXT: [[ARRAYIDX4:%.*]] = 
getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S12]], i32 0, i32 [[STACKPTR_0]] +// CHECK-AMD-NEXT: [[INC2:%.*]] = add nsw i32 [[STACKPTR_0]], 1 +// CHECK-AMD-NEXT: store i32 0, ptr addrspace(5) [[ARRAYIDX4]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: br label [[IF_END]] +// CHECK-AMD: if.end: +// CHECK-AMD-NEXT: [[STACKPTR_1:%.*]] = phi i32 [ [[INC2]], [[IF_THEN]] ], [ [[STACKPTR_0]], [[DO_BODY]] ] +// CHECK-AMD-NEXT: [[DEC]] = add nsw i32 [[STACKPTR_1]], -1 +// CHECK-AMD-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(5) [[STACK_H2S12]], i32 0, i32 [[DEC]] +// CHECK-AMD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[ARRAYIDX6]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: store i32 [[TMP1]], ptr [[RES]], align 4, !tbaa [[TBAA9]] +// CHECK-AMD-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP1]], 0 +// CHECK-AMD-NEXT: br i1 [[CMP7]], label [[DO_BODY]], label [[DO_END:%.*]], !llvm.loop [[LOOP18]] +// CHECK-AMD: do.end: +// CHECK-AMD-NEXT: ret void +// +// +// CHECK-NVIDIA-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43 +// CHECK-NVIDIA-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[LOCALPADDING:%.*]], i64 noundef [[RES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NVIDIA-NEXT: entry: +// CHECK-NVIDIA-NEXT: [[DOTOMP_COMB_LB_I:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_COMB_UB_I:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_STRIDE_I:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_IS_LAST_I:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [4 x ptr], align 8 +// CHECK-NVIDIA-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1:[0-9]+]], i8 2, i1 false) #[[ATTR2:[0-9]+]] +// CHECK-NVIDIA-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +// CHECK-NVIDIA-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] +// CHECK-NVIDIA: common.ret: +// CHECK-NVIDIA-NEXT: ret void +// CHECK-NVIDIA: user_code.entry: +// CHECK-NVIDIA-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_COMB_LB_I]]) #[[ATTR12:[0-9]+]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_COMB_UB_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 999, ptr [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_STRIDE_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_IS_LAST_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[NVPTX_NUM_THREADS_I:%.*]] = tail call i32 @__kmpc_get_hardware_num_threads_in_block() #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @__kmpc_distribute_static_init_4(ptr nonnull @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 91, ptr nonnull [[DOTOMP_IS_LAST_I]], ptr nonnull [[DOTOMP_COMB_LB_I]], ptr nonnull [[DOTOMP_COMB_UB_I]], ptr nonnull [[DOTOMP_STRIDE_I]], i32 1, i32 [[NVPTX_NUM_THREADS_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: 
[[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_I]], align 4 +// CHECK-NVIDIA-NEXT: [[COND_I:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 999) +// CHECK-NVIDIA-NEXT: [[DOTOMP_IV_0_PR_I:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_I]], align 4 +// CHECK-NVIDIA-NEXT: store i32 [[COND_I]], ptr [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP11_I:%.*]] = icmp slt i32 [[DOTOMP_IV_0_PR_I]], 1000 +// CHECK-NVIDIA-NEXT: br i1 [[CMP11_I]], label [[OMP_INNER_FOR_BODY_LR_PH_I:%.*]], label %{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined.exit +// CHECK-NVIDIA: omp.inner.for.body.lr.ph.i: +// CHECK-NVIDIA-NEXT: [[RES_CASTED_SROA_0_0_INSERT_EXT:%.*]] = and i64 [[RES]], 4294967295 +// CHECK-NVIDIA-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 1 +// CHECK-NVIDIA-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 2 +// CHECK-NVIDIA-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 3 +// CHECK-NVIDIA-NEXT: [[TMP6:%.*]] = inttoptr i64 [[RES_CASTED_SROA_0_0_INSERT_EXT]] to ptr +// CHECK-NVIDIA-NEXT: br label [[OMP_INNER_FOR_BODY_I:%.*]] +// CHECK-NVIDIA: omp.inner.for.body.i: +// CHECK-NVIDIA-NEXT: [[STOREMERGE3_I:%.*]] = phi i32 [ [[COND_I]], [[OMP_INNER_FOR_BODY_LR_PH_I]] ], [ [[COND8_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ] +// CHECK-NVIDIA-NEXT: [[DOTOMP_IV_02_I:%.*]] = phi i32 [ [[DOTOMP_IV_0_PR_I]], [[OMP_INNER_FOR_BODY_LR_PH_I]] ], [ [[ADD2_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ] +// CHECK-NVIDIA-NEXT: [[TMP7:%.*]] = zext i32 [[DOTOMP_IV_02_I]] to i64 +// CHECK-NVIDIA-NEXT: [[TMP8:%.*]] = zext i32 [[STOREMERGE3_I]] to i64 +// CHECK-NVIDIA-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP7]] to ptr +// CHECK-NVIDIA-NEXT: store ptr [[TMP9]], ptr [[CAPTURED_VARS_ADDRS_I]], align 8, !tbaa [[TBAA12:![0-9]+]] +// CHECK-NVIDIA-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP8]] to ptr +// CHECK-NVIDIA-NEXT: store ptr [[TMP10]], ptr [[TMP3]], align 8, !tbaa [[TBAA12]] +// CHECK-NVIDIA-NEXT: store ptr [[LOCALPADDING]], ptr [[TMP4]], align 8, !tbaa [[TBAA12]] +// CHECK-NVIDIA-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8, !tbaa [[TBAA12]] +// CHECK-NVIDIA-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined_omp_outlined, ptr null, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 4) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ADD2_I]] = add nsw i32 [[TMP12]], [[TMP11]] +// CHECK-NVIDIA-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP13]], [[TMP11]] +// CHECK-NVIDIA-NEXT: [[COND8_I]] = call i32 @llvm.smin.i32(i32 [[ADD3_I]], i32 999) +// CHECK-NVIDIA-NEXT: store i32 [[COND8_I]], ptr [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP1_I:%.*]] = icmp slt i32 [[ADD2_I]], 1000 +// CHECK-NVIDIA-NEXT: br i1 [[CMP1_I]], label [[OMP_INNER_FOR_BODY_I]], label %{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined.exit +// CHECK-NVIDIA: 
{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined.exit: +// CHECK-NVIDIA-NEXT: call void @__kmpc_distribute_static_fini(ptr nonnull @[[GLOB2]], i32 [[TMP1]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_IS_LAST_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_STRIDE_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_COMB_UB_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_COMB_LB_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +// CHECK-NVIDIA-NEXT: call void @__kmpc_target_deinit(ptr nonnull @[[GLOB1]], i8 2) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: br label [[COMMON_RET]] +// +// +// CHECK-NVIDIA-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z25emits_alloc_shared_callerv_l43_omp_outlined_omp_outlined +// CHECK-NVIDIA-SAME: (ptr noalias nocapture noundef readonly [[DOTGLOBAL_TID_:%.*]], ptr noalias nocapture readnone [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr nocapture noundef nonnull align 4 dereferenceable(4000) [[LOCALPADDING:%.*]], i64 [[RES:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NVIDIA-NEXT: entry: +// CHECK-NVIDIA-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_LB]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_UB]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[CONV:%.*]] = trunc i64 [[DOTPREVIOUS_LB_]] to i32 +// CHECK-NVIDIA-NEXT: [[CONV1:%.*]] = trunc i64 [[DOTPREVIOUS_UB_]] to i32 +// CHECK-NVIDIA-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_STRIDE]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_IS_LAST]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @__kmpc_for_static_init_4(ptr nonnull @[[GLOB3:[0-9]+]], i32 [[TMP0]], i32 33, ptr nonnull [[DOTOMP_IS_LAST]], ptr nonnull [[DOTOMP_LB]], ptr nonnull [[DOTOMP_UB]], ptr nonnull [[DOTOMP_STRIDE]], i32 1, i32 1) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CONV229:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-NVIDIA-NEXT: [[CMP_NOT30:%.*]] = icmp ugt i64 [[CONV229]], [[DOTPREVIOUS_UB_]] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_NOT30]], label [[OMP_LOOP_EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]] +// CHECK-NVIDIA: for.cond.preheader: +// CHECK-NVIDIA-NEXT: [[CONV232:%.*]] = phi i64 [ [[CONV2:%.*]], [[FOR_COND_CLEANUP:%.*]] ], [ [[CONV229]], [[ENTRY:%.*]] ] +// CHECK-NVIDIA-NEXT: [[DOTOMP_IV_031:%.*]] = phi i32 [ [[ADD18:%.*]], [[FOR_COND_CLEANUP]] ], [ [[TMP1]], [[ENTRY]] ] 
+// CHECK-NVIDIA-NEXT: br label [[FOR_COND4_PREHEADER:%.*]] +// CHECK-NVIDIA: for.cond4.preheader: +// CHECK-NVIDIA-NEXT: [[CMP3:%.*]] = phi i1 [ true, [[FOR_COND_PREHEADER]] ], [ false, [[FOR_COND_CLEANUP6:%.*]] ] +// CHECK-NVIDIA-NEXT: br label [[FOR_COND8_PREHEADER:%.*]] +// CHECK-NVIDIA: for.cond.cleanup: +// CHECK-NVIDIA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[LOCALPADDING]], i64 0, i64 [[CONV232]] +// CHECK-NVIDIA-NEXT: store i32 [[TMP10:%.*]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ADD18]] = add nsw i32 [[TMP2]], [[DOTOMP_IV_031]] +// CHECK-NVIDIA-NEXT: [[CONV2]] = sext i32 [[ADD18]] to i64 +// CHECK-NVIDIA-NEXT: [[CMP_NOT:%.*]] = icmp ugt i64 [[CONV2]], [[DOTPREVIOUS_UB_]] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_NOT]], label [[OMP_LOOP_EXIT]], label [[FOR_COND_PREHEADER]] +// CHECK-NVIDIA: for.cond8.preheader: +// CHECK-NVIDIA-NEXT: [[HY_027:%.*]] = phi i32 [ 0, [[FOR_COND4_PREHEADER]] ], [ [[INC13:%.*]], [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3:%.*]] ] +// CHECK-NVIDIA-NEXT: [[STACK_I:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I]], i64 0, i64 2 +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I:%.*]] +// CHECK-NVIDIA: for.cond.cleanup6: +// CHECK-NVIDIA-NEXT: br i1 [[CMP3]], label [[FOR_COND4_PREHEADER]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK-NVIDIA: do.body.i: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I:%.*]] = phi i32 [ 1, [[FOR_COND8_PREHEADER]] ], [ [[DEC_I:%.*]], [[IF_END_I:%.*]] ] +// CHECK-NVIDIA-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP3]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I]] +// CHECK-NVIDIA: if.then.i: +// CHECK-NVIDIA-NEXT: [[INC2_I:%.*]] = add nsw i32 [[STACKPTR_0_I]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I:%.*]] = sext i32 [[STACKPTR_0_I]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I]], i64 0, i64 [[IDXPROM3_I]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I]] +// CHECK-NVIDIA: if.end.i: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I:%.*]] = phi i32 [ [[INC2_I]], [[IF_THEN_I]] ], [ [[STACKPTR_0_I]], [[DO_BODY_I]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I]] = add nsw i32 [[STACKPTR_1_I]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I:%.*]] = sext i32 [[DEC_I]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I]], i64 0, i64 [[IDXPROM5_I]] +// CHECK-NVIDIA-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX6_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I]]) #[[ATTR13:[0-9]+]] +// CHECK-NVIDIA-NEXT: [[CMP8_I:%.*]] = icmp sgt i32 [[TMP4]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP8_I]], label [[DO_BODY_I]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT:%.*]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK-NVIDIA: _Z18emits_alloc_sharedPKiPi.internalized.exit: +// CHECK-NVIDIA-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I]], i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[STACK_I_1:%.*]] = call align 8 
dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_I_1]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ARRAYIDX7_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1]], i64 0, i64 2 +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_1:%.*]] +// CHECK-NVIDIA: do.body.i.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_1:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT]] ], [ [[DEC_I_1:%.*]], [[IF_END_I_1:%.*]] ] +// CHECK-NVIDIA-NEXT: [[TMP5:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP_I_1:%.*]] = icmp sgt i32 [[TMP5]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I_1]], label [[IF_THEN_I_1:%.*]], label [[IF_END_I_1]] +// CHECK-NVIDIA: if.then.i.1: +// CHECK-NVIDIA-NEXT: [[INC2_I_1:%.*]] = add nsw i32 [[STACKPTR_0_I_1]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_1:%.*]] = sext i32 [[STACKPTR_0_I_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1]], i64 0, i64 [[IDXPROM3_I_1]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_1]] +// CHECK-NVIDIA: if.end.i.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_1:%.*]] = phi i32 [ [[INC2_I_1]], [[IF_THEN_I_1]] ], [ [[STACKPTR_0_I_1]], [[DO_BODY_I_1]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_1]] = add nsw i32 [[STACKPTR_1_I_1]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_1:%.*]] = sext i32 [[DEC_I_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_1]], i64 0, i64 [[IDXPROM5_I_1]] +// CHECK-NVIDIA-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX6_I_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_1]]) #[[ATTR13]] +// CHECK-NVIDIA-NEXT: [[CMP8_I_1:%.*]] = icmp sgt i32 [[TMP6]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP8_I_1]], label [[DO_BODY_I_1]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1:%.*]], !llvm.loop [[LOOP16]] +// CHECK-NVIDIA: _Z18emits_alloc_sharedPKiPi.internalized.exit.1: +// CHECK-NVIDIA-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_1]], i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[STACK_I_2:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_I_2]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ARRAYIDX7_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2]], i64 0, i64 2 +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_2:%.*]] +// CHECK-NVIDIA: do.body.i.2: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_2:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1]] ], [ [[DEC_I_2:%.*]], [[IF_END_I_2:%.*]] ] +// CHECK-NVIDIA-NEXT: [[TMP7:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP_I_2:%.*]] = icmp sgt i32 [[TMP7]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I_2]], label [[IF_THEN_I_2:%.*]], label [[IF_END_I_2]] +// CHECK-NVIDIA: if.then.i.2: +// CHECK-NVIDIA-NEXT: [[INC2_I_2:%.*]] = add nsw i32 [[STACKPTR_0_I_2]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_2:%.*]] = sext i32 [[STACKPTR_0_I_2]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2]], i64 0, i64 [[IDXPROM3_I_2]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_2]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_2]] +// CHECK-NVIDIA: if.end.i.2: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_2:%.*]] 
= phi i32 [ [[INC2_I_2]], [[IF_THEN_I_2]] ], [ [[STACKPTR_0_I_2]], [[DO_BODY_I_2]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_2]] = add nsw i32 [[STACKPTR_1_I_2]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_2:%.*]] = sext i32 [[DEC_I_2]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_2]], i64 0, i64 [[IDXPROM5_I_2]] +// CHECK-NVIDIA-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6_I_2]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_2]]) #[[ATTR13]] +// CHECK-NVIDIA-NEXT: [[CMP8_I_2:%.*]] = icmp sgt i32 [[TMP8]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP8_I_2]], label [[DO_BODY_I_2]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2:%.*]], !llvm.loop [[LOOP16]] +// CHECK-NVIDIA: _Z18emits_alloc_sharedPKiPi.internalized.exit.2: +// CHECK-NVIDIA-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_2]], i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[STACK_I_3:%.*]] = call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_I_3]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ARRAYIDX7_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3]], i64 0, i64 2 +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_3:%.*]] +// CHECK-NVIDIA: do.body.i.3: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_3:%.*]] = phi i32 [ 1, [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2]] ], [ [[DEC_I_3:%.*]], [[IF_END_I_3:%.*]] ] +// CHECK-NVIDIA-NEXT: [[TMP9:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP_I_3:%.*]] = icmp sgt i32 [[TMP9]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I_3]], label [[IF_THEN_I_3:%.*]], label [[IF_END_I_3]] +// CHECK-NVIDIA: if.then.i.3: +// CHECK-NVIDIA-NEXT: [[INC2_I_3:%.*]] = add nsw i32 [[STACKPTR_0_I_3]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_3:%.*]] = sext i32 [[STACKPTR_0_I_3]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3]], i64 0, i64 [[IDXPROM3_I_3]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_3]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_3]] +// CHECK-NVIDIA: if.end.i.3: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_3:%.*]] = phi i32 [ [[INC2_I_3]], [[IF_THEN_I_3]] ], [ [[STACKPTR_0_I_3]], [[DO_BODY_I_3]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_3]] = add nsw i32 [[STACKPTR_1_I_3]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_3:%.*]] = sext i32 [[DEC_I_3]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_I_3]], i64 0, i64 [[IDXPROM5_I_3]] +// CHECK-NVIDIA-NEXT: [[TMP10]] = load i32, ptr [[ARRAYIDX6_I_3]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7_I_3]]) #[[ATTR13]] +// CHECK-NVIDIA-NEXT: [[CMP8_I_3:%.*]] = icmp sgt i32 [[TMP10]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP8_I_3]], label [[DO_BODY_I_3]], label [[_Z18EMITS_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3]], !llvm.loop [[LOOP16]] +// CHECK-NVIDIA: _Z18emits_alloc_sharedPKiPi.internalized.exit.3: +// CHECK-NVIDIA-NEXT: call void @__kmpc_free_shared(ptr nonnull [[STACK_I_3]], i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[INC13]] = add nuw nsw i32 [[HY_027]], 1 +// CHECK-NVIDIA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 3 +// CHECK-NVIDIA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP6]], label [[FOR_COND8_PREHEADER]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK-NVIDIA: omp.loop.exit: +// CHECK-NVIDIA-NEXT: call void 
@__kmpc_distribute_static_fini(ptr nonnull @[[GLOB2]], i32 [[TMP0]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_IS_LAST]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_STRIDE]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_UB]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_LB]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: ret void +// +// +// CHECK-NVIDIA-LABEL: define {{[^@]+}}@_Z18emits_alloc_sharedPKiPi +// CHECK-NVIDIA-SAME: (ptr nocapture noundef readonly [[LOCALPADDING:%.*]], ptr nocapture noundef [[RES:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// CHECK-NVIDIA-NEXT: entry: +// CHECK-NVIDIA-NEXT: [[STACK:%.*]] = tail call align 8 dereferenceable_or_null(256) ptr @__kmpc_alloc_shared(i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[RES]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK]], i64 0, i64 2 +// CHECK-NVIDIA-NEXT: br label [[DO_BODY:%.*]] +// CHECK-NVIDIA: do.body: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[DEC:%.*]], [[IF_END:%.*]] ] +// CHECK-NVIDIA-NEXT: [[TMP0:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]] +// CHECK-NVIDIA: if.then: +// CHECK-NVIDIA-NEXT: [[INC2:%.*]] = add nsw i32 [[STACKPTR_0]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3:%.*]] = sext i32 [[STACKPTR_0]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK]], i64 0, i64 [[IDXPROM3]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END]] +// CHECK-NVIDIA: if.end: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1:%.*]] = phi i32 [ [[INC2]], [[IF_THEN]] ], [ [[STACKPTR_0]], [[DO_BODY]] ] +// CHECK-NVIDIA-NEXT: [[DEC]] = add nsw i32 [[STACKPTR_1]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5:%.*]] = sext i32 [[DEC]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK]], i64 0, i64 [[IDXPROM5]] +// CHECK-NVIDIA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: store i32 [[TMP1]], ptr [[RES]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: tail call void @_Z3fooPi(ptr noundef nonnull [[ARRAYIDX7]]) #[[ATTR13]] +// CHECK-NVIDIA-NEXT: [[TMP2:%.*]] = load i32, ptr [[RES]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[TMP2]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP8]], label [[DO_BODY]], label [[DO_END:%.*]], !llvm.loop [[LOOP16]] +// CHECK-NVIDIA: do.end: +// CHECK-NVIDIA-NEXT: tail call void @__kmpc_free_shared(ptr nonnull [[STACK]], i64 256) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: ret void +// +// +// CHECK-NVIDIA-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77 +// CHECK-NVIDIA-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[LOCALPADDING:%.*]], i64 noundef [[RES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NVIDIA-NEXT: entry: +// CHECK-NVIDIA-NEXT: [[DOTOMP_COMB_LB_I:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_COMB_UB_I:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: 
[[DOTOMP_STRIDE_I:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_IS_LAST_I:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [4 x ptr], align 8 +// CHECK-NVIDIA-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +// CHECK-NVIDIA-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] +// CHECK-NVIDIA: common.ret: +// CHECK-NVIDIA-NEXT: ret void +// CHECK-NVIDIA: user_code.entry: +// CHECK-NVIDIA-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_COMB_LB_I]]) #[[ATTR12]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_COMB_UB_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 999, ptr [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_STRIDE_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_IS_LAST_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[NVPTX_NUM_THREADS_I:%.*]] = tail call i32 @__kmpc_get_hardware_num_threads_in_block() #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @__kmpc_distribute_static_init_4(ptr nonnull @[[GLOB2]], i32 [[TMP1]], i32 91, ptr nonnull [[DOTOMP_IS_LAST_I]], ptr nonnull [[DOTOMP_COMB_LB_I]], ptr nonnull [[DOTOMP_COMB_UB_I]], ptr nonnull [[DOTOMP_STRIDE_I]], i32 1, i32 [[NVPTX_NUM_THREADS_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_I]], align 4 +// CHECK-NVIDIA-NEXT: [[COND_I:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 999) +// CHECK-NVIDIA-NEXT: [[DOTOMP_IV_0_PR_I:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_I]], align 4 +// CHECK-NVIDIA-NEXT: store i32 [[COND_I]], ptr [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP11_I:%.*]] = icmp slt i32 [[DOTOMP_IV_0_PR_I]], 1000 +// CHECK-NVIDIA-NEXT: br i1 [[CMP11_I]], label [[OMP_INNER_FOR_BODY_LR_PH_I:%.*]], label %{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined.exit +// CHECK-NVIDIA: omp.inner.for.body.lr.ph.i: +// CHECK-NVIDIA-NEXT: [[RES_CASTED_SROA_0_0_INSERT_EXT:%.*]] = and i64 [[RES]], 4294967295 +// CHECK-NVIDIA-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 1 +// CHECK-NVIDIA-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 2 +// CHECK-NVIDIA-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 3 +// CHECK-NVIDIA-NEXT: [[TMP6:%.*]] = inttoptr i64 [[RES_CASTED_SROA_0_0_INSERT_EXT]] to ptr +// CHECK-NVIDIA-NEXT: br label [[OMP_INNER_FOR_BODY_I:%.*]] +// CHECK-NVIDIA: omp.inner.for.body.i: +// CHECK-NVIDIA-NEXT: [[STOREMERGE3_I:%.*]] = phi i32 [ [[COND_I]], [[OMP_INNER_FOR_BODY_LR_PH_I]] ], [ [[COND8_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ] +// CHECK-NVIDIA-NEXT: [[DOTOMP_IV_02_I:%.*]] = phi i32 [ [[DOTOMP_IV_0_PR_I]], 
[[OMP_INNER_FOR_BODY_LR_PH_I]] ], [ [[ADD2_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ] +// CHECK-NVIDIA-NEXT: [[TMP7:%.*]] = zext i32 [[DOTOMP_IV_02_I]] to i64 +// CHECK-NVIDIA-NEXT: [[TMP8:%.*]] = zext i32 [[STOREMERGE3_I]] to i64 +// CHECK-NVIDIA-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP7]] to ptr +// CHECK-NVIDIA-NEXT: store ptr [[TMP9]], ptr [[CAPTURED_VARS_ADDRS_I]], align 8, !tbaa [[TBAA12]] +// CHECK-NVIDIA-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP8]] to ptr +// CHECK-NVIDIA-NEXT: store ptr [[TMP10]], ptr [[TMP3]], align 8, !tbaa [[TBAA12]] +// CHECK-NVIDIA-NEXT: store ptr [[LOCALPADDING]], ptr [[TMP4]], align 8, !tbaa [[TBAA12]] +// CHECK-NVIDIA-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8, !tbaa [[TBAA12]] +// CHECK-NVIDIA-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined_omp_outlined, ptr null, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 4) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ADD2_I]] = add nsw i32 [[TMP12]], [[TMP11]] +// CHECK-NVIDIA-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_COMB_LB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP13]], [[TMP11]] +// CHECK-NVIDIA-NEXT: [[COND8_I]] = call i32 @llvm.smin.i32(i32 [[ADD3_I]], i32 999) +// CHECK-NVIDIA-NEXT: store i32 [[COND8_I]], ptr [[DOTOMP_COMB_UB_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP1_I:%.*]] = icmp slt i32 [[ADD2_I]], 1000 +// CHECK-NVIDIA-NEXT: br i1 [[CMP1_I]], label [[OMP_INNER_FOR_BODY_I]], label %{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined.exit +// CHECK-NVIDIA: {{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined.exit: +// CHECK-NVIDIA-NEXT: call void @__kmpc_distribute_static_fini(ptr nonnull @[[GLOB2]], i32 [[TMP1]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_IS_LAST_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_STRIDE_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_COMB_UB_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_COMB_LB_I]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) +// CHECK-NVIDIA-NEXT: call void @__kmpc_target_deinit(ptr nonnull @[[GLOB1]], i8 2) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: br label [[COMMON_RET]] +// +// +// CHECK-NVIDIA-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z33does_not_emit_alloc_shared_callerv_l77_omp_outlined_omp_outlined +// CHECK-NVIDIA-SAME: (ptr noalias nocapture noundef readonly [[DOTGLOBAL_TID_:%.*]], ptr noalias nocapture readnone [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr nocapture noundef nonnull align 4 dereferenceable(4000) [[LOCALPADDING:%.*]], i64 [[RES:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK-NVIDIA-NEXT: entry: +// CHECK-NVIDIA-NEXT: [[STACK_H2S1_I:%.*]] = alloca [256 x i8], align 8 +// CHECK-NVIDIA-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, 
align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_LB]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_UB]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[CONV:%.*]] = trunc i64 [[DOTPREVIOUS_LB_]] to i32 +// CHECK-NVIDIA-NEXT: [[CONV1:%.*]] = trunc i64 [[DOTPREVIOUS_UB_]] to i32 +// CHECK-NVIDIA-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_STRIDE]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[DOTOMP_IS_LAST]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: call void @__kmpc_for_static_init_4(ptr nonnull @[[GLOB3]], i32 [[TMP0]], i32 33, ptr nonnull [[DOTOMP_IS_LAST]], ptr nonnull [[DOTOMP_LB]], ptr nonnull [[DOTOMP_UB]], ptr nonnull [[DOTOMP_STRIDE]], i32 1, i32 1) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CONV228:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-NVIDIA-NEXT: [[CMP_NOT29:%.*]] = icmp ugt i64 [[CONV228]], [[DOTPREVIOUS_UB_]] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_NOT29]], label [[OMP_LOOP_EXIT:%.*]], label [[FOR_COND_PREHEADER_LR_PH:%.*]] +// CHECK-NVIDIA: for.cond.preheader.lr.ph: +// CHECK-NVIDIA-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64 +// CHECK-NVIDIA-NEXT: br label [[FOR_COND_PREHEADER:%.*]] +// CHECK-NVIDIA: for.cond.preheader: +// CHECK-NVIDIA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[CONV228]], [[FOR_COND_PREHEADER_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP6_1:%.*]] ] +// CHECK-NVIDIA-NEXT: [[TMP4:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP4]], 0 +// CHECK-NVIDIA-NEXT: br label [[FOR_COND8_PREHEADER:%.*]] +// CHECK-NVIDIA: for.cond8.preheader: +// CHECK-NVIDIA-NEXT: [[HY_026:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER]] ], [ [[INC13:%.*]], [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3:%.*]] ] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S1_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I:%.*]] +// CHECK-NVIDIA: for.cond8.preheader.1: +// CHECK-NVIDIA-NEXT: [[HY_026_1:%.*]] = phi i32 [ [[INC13_1:%.*]], [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_1:%.*]] ], [ 0, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3]] ] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S1_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_133:%.*]] +// CHECK-NVIDIA: do.body.i.133: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_132:%.*]] = phi i32 [ 1, [[FOR_COND8_PREHEADER_1:%.*]] ], [ 
[[DEC_I_139:%.*]], [[IF_END_I_143:%.*]] ] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_137:%.*]], label [[IF_END_I_143]] +// CHECK-NVIDIA: if.then.i.137: +// CHECK-NVIDIA-NEXT: [[INC2_I_134:%.*]] = add nsw i32 [[STACKPTR_0_I_132]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_135:%.*]] = sext i32 [[STACKPTR_0_I_132]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_136:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM3_I_135]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_136]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_143]] +// CHECK-NVIDIA: if.end.i.143: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_138:%.*]] = phi i32 [ [[INC2_I_134]], [[IF_THEN_I_137]] ], [ [[STACKPTR_0_I_132]], [[DO_BODY_I_133]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_139]] = add nsw i32 [[STACKPTR_1_I_138]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_140:%.*]] = sext i32 [[DEC_I_139]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_141:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM5_I_140]] +// CHECK-NVIDIA-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX6_I_141]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP7_I_142:%.*]] = icmp sgt i32 [[TMP5]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7_I_142]], label [[DO_BODY_I_133]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_144:%.*]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK-NVIDIA: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.144: +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S1_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_1_1:%.*]] +// CHECK-NVIDIA: do.body.i.1.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_1_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_144]] ], [ [[DEC_I_1_1:%.*]], [[IF_END_I_1_1:%.*]] ] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_1_1:%.*]], label [[IF_END_I_1_1]] +// CHECK-NVIDIA: if.then.i.1.1: +// CHECK-NVIDIA-NEXT: [[INC2_I_1_1:%.*]] = add nsw i32 [[STACKPTR_0_I_1_1]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_1_1:%.*]] = sext i32 [[STACKPTR_0_I_1_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM3_I_1_1]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_1_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_1_1]] +// CHECK-NVIDIA: if.end.i.1.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_1_1:%.*]] = phi i32 [ [[INC2_I_1_1]], [[IF_THEN_I_1_1]] ], [ [[STACKPTR_0_I_1_1]], [[DO_BODY_I_1_1]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_1_1]] = add nsw i32 [[STACKPTR_1_I_1_1]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_1_1:%.*]] = sext i32 [[DEC_I_1_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_1_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM5_I_1_1]] +// CHECK-NVIDIA-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX6_I_1_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP7_I_1_1:%.*]] = icmp sgt i32 [[TMP6]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7_I_1_1]], label [[DO_BODY_I_1_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-NVIDIA: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.1.1: +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// 
CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S1_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_2_1:%.*]] +// CHECK-NVIDIA: do.body.i.2.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_2_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1_1]] ], [ [[DEC_I_2_1:%.*]], [[IF_END_I_2_1:%.*]] ] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_2_1:%.*]], label [[IF_END_I_2_1]] +// CHECK-NVIDIA: if.then.i.2.1: +// CHECK-NVIDIA-NEXT: [[INC2_I_2_1:%.*]] = add nsw i32 [[STACKPTR_0_I_2_1]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_2_1:%.*]] = sext i32 [[STACKPTR_0_I_2_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM3_I_2_1]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_2_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_2_1]] +// CHECK-NVIDIA: if.end.i.2.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_2_1:%.*]] = phi i32 [ [[INC2_I_2_1]], [[IF_THEN_I_2_1]] ], [ [[STACKPTR_0_I_2_1]], [[DO_BODY_I_2_1]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_2_1]] = add nsw i32 [[STACKPTR_1_I_2_1]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_2_1:%.*]] = sext i32 [[DEC_I_2_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_2_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM5_I_2_1]] +// CHECK-NVIDIA-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX6_I_2_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP7_I_2_1:%.*]] = icmp sgt i32 [[TMP7]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7_I_2_1]], label [[DO_BODY_I_2_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-NVIDIA: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.2.1: +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S1_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_3_1:%.*]] +// CHECK-NVIDIA: do.body.i.3.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_3_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2_1]] ], [ [[DEC_I_3_1:%.*]], [[IF_END_I_3_1:%.*]] ] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_3_1:%.*]], label [[IF_END_I_3_1]] +// CHECK-NVIDIA: if.then.i.3.1: +// CHECK-NVIDIA-NEXT: [[INC2_I_3_1:%.*]] = add nsw i32 [[STACKPTR_0_I_3_1]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_3_1:%.*]] = sext i32 [[STACKPTR_0_I_3_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_3_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM3_I_3_1]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_3_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_3_1]] +// CHECK-NVIDIA: if.end.i.3.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_3_1:%.*]] = phi i32 [ [[INC2_I_3_1]], [[IF_THEN_I_3_1]] ], [ [[STACKPTR_0_I_3_1]], [[DO_BODY_I_3_1]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_3_1]] = add nsw i32 [[STACKPTR_1_I_3_1]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_3_1:%.*]] = sext i32 [[DEC_I_3_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_3_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM5_I_3_1]] +// CHECK-NVIDIA-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6_I_3_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: 
[[CMP7_I_3_1:%.*]] = icmp sgt i32 [[TMP8]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7_I_3_1]], label [[DO_BODY_I_3_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3_1]], !llvm.loop [[LOOP18]] +// CHECK-NVIDIA: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.3.1: +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: [[INC13_1]] = add nuw nsw i32 [[HY_026_1]], 1 +// CHECK-NVIDIA-NEXT: [[EXITCOND_1_NOT:%.*]] = icmp eq i32 [[INC13_1]], 3 +// CHECK-NVIDIA-NEXT: br i1 [[EXITCOND_1_NOT]], label [[FOR_COND_CLEANUP6_1]], label [[FOR_COND8_PREHEADER_1]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK-NVIDIA: for.cond.cleanup6.1: +// CHECK-NVIDIA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[LOCALPADDING]], i64 0, i64 [[INDVARS_IV]] +// CHECK-NVIDIA-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP3]] +// CHECK-NVIDIA-NEXT: [[CMP_NOT:%.*]] = icmp ugt i64 [[INDVARS_IV_NEXT]], [[DOTPREVIOUS_UB_]] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_NOT]], label [[OMP_LOOP_EXIT]], label [[FOR_COND_PREHEADER]] +// CHECK-NVIDIA: do.body.i: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I:%.*]] = phi i32 [ 1, [[FOR_COND8_PREHEADER]] ], [ [[DEC_I:%.*]], [[IF_END_I:%.*]] ] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I]] +// CHECK-NVIDIA: if.then.i: +// CHECK-NVIDIA-NEXT: [[INC2_I:%.*]] = add nsw i32 [[STACKPTR_0_I]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I:%.*]] = sext i32 [[STACKPTR_0_I]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM3_I]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I]] +// CHECK-NVIDIA: if.end.i: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I:%.*]] = phi i32 [ [[INC2_I]], [[IF_THEN_I]] ], [ [[STACKPTR_0_I]], [[DO_BODY_I]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I]] = add nsw i32 [[STACKPTR_1_I]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I:%.*]] = sext i32 [[DEC_I]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM5_I]] +// CHECK-NVIDIA-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP7_I:%.*]] = icmp sgt i32 [[TMP9]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7_I]], label [[DO_BODY_I]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT:%.*]], !llvm.loop [[LOOP18]] +// CHECK-NVIDIA: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit: +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S1_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_1:%.*]] +// CHECK-NVIDIA: do.body.i.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_1:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT]] ], [ [[DEC_I_1:%.*]], [[IF_END_I_1:%.*]] ] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_1:%.*]], label [[IF_END_I_1]] +// CHECK-NVIDIA: if.then.i.1: +// CHECK-NVIDIA-NEXT: [[INC2_I_1:%.*]] = add nsw i32 [[STACKPTR_0_I_1]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_1:%.*]] = sext i32 [[STACKPTR_0_I_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], 
i64 0, i64 [[IDXPROM3_I_1]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_1]] +// CHECK-NVIDIA: if.end.i.1: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_1:%.*]] = phi i32 [ [[INC2_I_1]], [[IF_THEN_I_1]] ], [ [[STACKPTR_0_I_1]], [[DO_BODY_I_1]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_1]] = add nsw i32 [[STACKPTR_1_I_1]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_1:%.*]] = sext i32 [[DEC_I_1]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_1:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM5_I_1]] +// CHECK-NVIDIA-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX6_I_1]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP7_I_1:%.*]] = icmp sgt i32 [[TMP10]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7_I_1]], label [[DO_BODY_I_1]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1:%.*]], !llvm.loop [[LOOP18]] +// CHECK-NVIDIA: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.1: +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S1_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_2:%.*]] +// CHECK-NVIDIA: do.body.i.2: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_2:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_1]] ], [ [[DEC_I_2:%.*]], [[IF_END_I_2:%.*]] ] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_2:%.*]], label [[IF_END_I_2]] +// CHECK-NVIDIA: if.then.i.2: +// CHECK-NVIDIA-NEXT: [[INC2_I_2:%.*]] = add nsw i32 [[STACKPTR_0_I_2]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_2:%.*]] = sext i32 [[STACKPTR_0_I_2]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM3_I_2]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_2]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_2]] +// CHECK-NVIDIA: if.end.i.2: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_2:%.*]] = phi i32 [ [[INC2_I_2]], [[IF_THEN_I_2]] ], [ [[STACKPTR_0_I_2]], [[DO_BODY_I_2]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_2]] = add nsw i32 [[STACKPTR_1_I_2]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_2:%.*]] = sext i32 [[DEC_I_2]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_2:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM5_I_2]] +// CHECK-NVIDIA-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX6_I_2]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP7_I_2:%.*]] = icmp sgt i32 [[TMP11]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7_I_2]], label [[DO_BODY_I_2]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2:%.*]], !llvm.loop [[LOOP18]] +// CHECK-NVIDIA: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.2: +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S1_I]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY_I_3:%.*]] +// CHECK-NVIDIA: do.body.i.3: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0_I_3:%.*]] = phi i32 [ 1, [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_2]] ], [ [[DEC_I_3:%.*]], [[IF_END_I_3:%.*]] ] +// CHECK-NVIDIA-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I_3:%.*]], label [[IF_END_I_3]] +// CHECK-NVIDIA: if.then.i.3: +// 
CHECK-NVIDIA-NEXT: [[INC2_I_3:%.*]] = add nsw i32 [[STACKPTR_0_I_3]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3_I_3:%.*]] = sext i32 [[STACKPTR_0_I_3]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM3_I_3]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4_I_3]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END_I_3]] +// CHECK-NVIDIA: if.end.i.3: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1_I_3:%.*]] = phi i32 [ [[INC2_I_3]], [[IF_THEN_I_3]] ], [ [[STACKPTR_0_I_3]], [[DO_BODY_I_3]] ] +// CHECK-NVIDIA-NEXT: [[DEC_I_3]] = add nsw i32 [[STACKPTR_1_I_3]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5_I_3:%.*]] = sext i32 [[DEC_I_3]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6_I_3:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S1_I]], i64 0, i64 [[IDXPROM5_I_3]] +// CHECK-NVIDIA-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX6_I_3]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP7_I_3:%.*]] = icmp sgt i32 [[TMP12]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7_I_3]], label [[DO_BODY_I_3]], label [[_Z26DOES_NOT_EMIT_ALLOC_SHAREDPKIPI_INTERNALIZED_EXIT_3]], !llvm.loop [[LOOP18]] +// CHECK-NVIDIA: _Z26does_not_emit_alloc_sharedPKiPi.internalized.exit.3: +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr nonnull [[STACK_H2S1_I]]) +// CHECK-NVIDIA-NEXT: [[INC13]] = add nuw nsw i32 [[HY_026]], 1 +// CHECK-NVIDIA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 3 +// CHECK-NVIDIA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND8_PREHEADER_1]], label [[FOR_COND8_PREHEADER]], !llvm.loop [[LOOP19]] +// CHECK-NVIDIA: omp.loop.exit: +// CHECK-NVIDIA-NEXT: call void @__kmpc_distribute_static_fini(ptr nonnull @[[GLOB2]], i32 [[TMP0]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_IS_LAST]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_STRIDE]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_UB]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[DOTOMP_LB]]) #[[ATTR2]] +// CHECK-NVIDIA-NEXT: ret void +// +// +// CHECK-NVIDIA-LABEL: define {{[^@]+}}@_Z26does_not_emit_alloc_sharedPKiPi +// CHECK-NVIDIA-SAME: (ptr nocapture noundef readonly [[LOCALPADDING:%.*]], ptr nocapture noundef writeonly [[RES:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// CHECK-NVIDIA-NEXT: entry: +// CHECK-NVIDIA-NEXT: [[STACK_H2S12:%.*]] = alloca [256 x i8], align 8 +// CHECK-NVIDIA-NEXT: store i32 -1, ptr [[STACK_H2S12]], align 8, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[RES]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[DO_BODY:%.*]] +// CHECK-NVIDIA: do.body: +// CHECK-NVIDIA-NEXT: [[STACKPTR_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[DEC:%.*]], [[IF_END:%.*]] ] +// CHECK-NVIDIA-NEXT: [[TMP0:%.*]] = load i32, ptr [[LOCALPADDING]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]] +// CHECK-NVIDIA: if.then: +// CHECK-NVIDIA-NEXT: [[INC2:%.*]] = add nsw i32 [[STACKPTR_0]], 1 +// CHECK-NVIDIA-NEXT: [[IDXPROM3:%.*]] = sext i32 [[STACKPTR_0]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S12]], i64 0, i64 [[IDXPROM3]] +// CHECK-NVIDIA-NEXT: store i32 0, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: br label [[IF_END]] +// CHECK-NVIDIA: 
if.end: +// CHECK-NVIDIA-NEXT: [[STACKPTR_1:%.*]] = phi i32 [ [[INC2]], [[IF_THEN]] ], [ [[STACKPTR_0]], [[DO_BODY]] ] +// CHECK-NVIDIA-NEXT: [[DEC]] = add nsw i32 [[STACKPTR_1]], -1 +// CHECK-NVIDIA-NEXT: [[IDXPROM5:%.*]] = sext i32 [[DEC]] to i64 +// CHECK-NVIDIA-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [64 x i32], ptr [[STACK_H2S12]], i64 0, i64 [[IDXPROM5]] +// CHECK-NVIDIA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: store i32 [[TMP1]], ptr [[RES]], align 4, !tbaa [[TBAA8]] +// CHECK-NVIDIA-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP1]], 0 +// CHECK-NVIDIA-NEXT: br i1 [[CMP7]], label [[DO_BODY]], label [[DO_END:%.*]], !llvm.loop [[LOOP18]] +// CHECK-NVIDIA: do.end: +// CHECK-NVIDIA-NEXT: ret void +// diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -7180,7 +7180,8 @@ } std::optional<APInt> Size = getSize(A, *this, AI); - if (MaxHeapToStackSize != -1) { + if (AI.LibraryFunctionId != LibFunc___kmpc_alloc_shared && + MaxHeapToStackSize != -1) { if (!Size || Size->ugt(MaxHeapToStackSize)) { LLVM_DEBUG({ if (!Size)